xen: features and fixes for 3.18-rc0

- Add pvscsi frontend and backend drivers.
 - Remove _PAGE_IOMAP PTE flag, freeing it for alternate uses.
 - Try and keep memory contiguous during PV memory setup (reduces
   SWIOTLB usage).
 - Allow front/back drivers to use threaded irqs.
 - Support large initrds in PV guests.
 - Fix PVH guests in preparation for Xen 4.5
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v1.4.12 (GNU/Linux)
 
 iQEcBAABAgAGBQJUNonmAAoJEFxbo/MsZsTRHAQH/inCjpCT+pkvTB0YAVfVvgMI
 gUogT8G+iB2MuCNpMffGIt8TAVXwcVtnOLH9ABH3IBVehzgipIbIiVEM9YhjrYvU
 1rgIKBpmZqSpjDHoIHpdHeCH67cVnRzA/PyoxZWLxPNmQ0t6bNf9yeAcCXK9PfUc
 7EAblUDmPGSx9x/EUnOKNNaZSEiUJZHDBXbMBLllk1+5H1vfKnpFCRGMG0IrfI44
 KVP2NX9Gfa05edMZYtH887FYyjFe2KNV6LJvE7+w7h2Dy0yIzf7y86t0l4n8gETb
 plvEUJ/lu9RYzTiZY/RxgBFYVTV59EqT45brSUtoe2Jcp8GSwiHslTHdfyFBwSo=
 =gw4d
 -----END PGP SIGNATURE-----

Merge tag 'stable/for-linus-3.18-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip

Pull Xen updates from David Vrabel:
 "Features and fixes:

   - Add pvscsi frontend and backend drivers.
   - Remove _PAGE_IOMAP PTE flag, freeing it for alternate uses.
   - Try and keep memory contiguous during PV memory setup (reduces
     SWIOTLB usage).
   - Allow front/back drivers to use threaded irqs.
   - Support large initrds in PV guests.
   - Fix PVH guests in preparation for Xen 4.5"

* tag 'stable/for-linus-3.18-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (22 commits)
  xen: remove DEFINE_XENBUS_DRIVER() macro
  xen/xenbus: Remove BUG_ON() when error string trucated
  xen/xenbus: Correct the comments for xenbus_grant_ring()
  x86/xen: Set EFER.NX and EFER.SCE in PVH guests
  xen: eliminate scalability issues from initrd handling
  xen: sync some headers with xen tree
  xen: make pvscsi frontend dependant on xenbus frontend
  arm{,64}/xen: Remove "EXPERIMENTAL" in the description of the Xen options
  xen-scsifront: don't deadlock if the ring becomes full
  x86: remove the Xen-specific _PAGE_IOMAP PTE flag
  x86/xen: do not use _PAGE_IOMAP PTE flag for I/O mappings
  x86: skip check for spurious faults for non-present faults
  xen/efi: Directly include needed headers
  xen-scsiback: clean up a type issue in scsiback_make_tpg()
  xen-scsifront: use GFP_ATOMIC under spin_lock
  MAINTAINERS: Add xen pvscsi maintainer
  xen-scsiback: Add Xen PV SCSI backend driver
  xen-scsifront: Add Xen PV SCSI frontend driver
  xen: Add Xen pvSCSI protocol description
  xen/events: support threaded irqs for interdomain event channels
  ...
This commit is contained in:
Linus Torvalds 2014-10-11 20:29:01 -04:00
commit 81ae31d782
46 changed files with 4221 additions and 270 deletions

View File

@ -10268,6 +10268,15 @@ S: Supported
F: drivers/block/xen-blkback/*
F: drivers/block/xen*
XEN PVSCSI DRIVERS
M: Juergen Gross <jgross@suse.com>
L: xen-devel@lists.xenproject.org (moderated for non-subscribers)
L: linux-scsi@vger.kernel.org
S: Supported
F: drivers/scsi/xen-scsifront.c
F: drivers/xen/xen-scsiback.c
F: include/xen/interface/io/vscsiif.h
XEN SWIOTLB SUBSYSTEM
M: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
L: xen-devel@lists.xenproject.org (moderated for non-subscribers)

View File

@ -1779,7 +1779,7 @@ config XEN_DOM0
depends on XEN
config XEN
bool "Xen guest support on ARM (EXPERIMENTAL)"
bool "Xen guest support on ARM"
depends on ARM && AEABI && OF
depends on CPU_V7 && !CPU_V6
depends on !GENERIC_ATOMIC64

View File

@ -349,7 +349,7 @@ config XEN_DOM0
depends on XEN
config XEN
bool "Xen guest support on ARM64 (EXPERIMENTAL)"
bool "Xen guest support on ARM64"
depends on ARM64 && OF
select SWIOTLB_XEN
help

View File

@ -23,7 +23,6 @@
#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1
#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1
#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
#define _PAGE_BIT_IOMAP _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */
#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */
#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
@ -52,7 +51,7 @@
#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
#define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
@ -168,10 +167,10 @@
#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP)
#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP)
#define __PAGE_KERNEL_IO (__PAGE_KERNEL)
#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE)
#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS)
#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC)
#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)

View File

@ -933,8 +933,17 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
* cross-processor TLB flush, even if no stale TLB entries exist
* on other processors.
*
* Spurious faults may only occur if the TLB contains an entry with
* fewer permission than the page table entry. Non-present (P = 0)
* and reserved bit (R = 1) faults are never spurious.
*
* There are no security implications to leaving a stale TLB when
* increasing the permissions on a page.
*
* Returns non-zero if a spurious fault was handled, zero otherwise.
*
* See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
* (Optional Invalidation).
*/
static noinline int
spurious_fault(unsigned long error_code, unsigned long address)
@ -945,8 +954,17 @@ spurious_fault(unsigned long error_code, unsigned long address)
pte_t *pte;
int ret;
/* Reserved-bit violation or user access to kernel space? */
if (error_code & (PF_USER | PF_RSVD))
/*
* Only writes to RO or instruction fetches from NX may cause
* spurious faults.
*
* These could be from user or supervisor accesses but the TLB
* is only lazily flushed after a kernel mapping protection
* change, so user accesses are not expected to cause spurious
* faults.
*/
if (error_code != (PF_WRITE | PF_PROT)
&& error_code != (PF_INSTR | PF_PROT))
return 0;
pgd = init_mm.pgd + pgd_index(address);

View File

@ -537,7 +537,7 @@ static void __init pagetable_init(void)
permanent_kmaps_init(pgd_base);
}
pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
EXPORT_SYMBOL_GPL(__supported_pte_mask);
/* user-defined highmem size */

View File

@ -151,7 +151,7 @@ early_param("gbpages", parse_direct_gbpages_on);
* around without checking the pgd every time.
*/
pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
pteval_t __supported_pte_mask __read_mostly = ~0;
EXPORT_SYMBOL_GPL(__supported_pte_mask);
int force_personality32;

View File

@ -442,8 +442,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
*/
prot |= _PAGE_CACHE_UC_MINUS;
prot |= _PAGE_IOMAP; /* creating a mapping for IO */
vma->vm_page_prot = __pgprot(prot);
if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,

View File

@ -15,12 +15,14 @@
* with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <linux/bitops.h>
#include <linux/efi.h>
#include <linux/init.h>
#include <linux/string.h>
#include <xen/xen-ops.h>
#include <asm/page.h>
#include <asm/setup.h>
void __init xen_efi_init(void)

View File

@ -1463,6 +1463,7 @@ static void __ref xen_setup_gdt(int cpu)
pv_cpu_ops.load_gdt = xen_load_gdt;
}
#ifdef CONFIG_XEN_PVH
/*
* A PV guest starts with default flags that are not set for PVH, set them
* here asap.
@ -1508,17 +1509,21 @@ static void __init xen_pvh_early_guest_init(void)
return;
xen_have_vector_callback = 1;
xen_pvh_early_cpu_init(0, false);
xen_pvh_set_cr_flags(0);
#ifdef CONFIG_X86_32
BUG(); /* PVH: Implement proper support. */
#endif
}
#endif /* CONFIG_XEN_PVH */
/* First C function to be called on Xen boot */
asmlinkage __visible void __init xen_start_kernel(void)
{
struct physdev_set_iopl set_iopl;
unsigned long initrd_start = 0;
int rc;
if (!xen_start_info)
@ -1527,7 +1532,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
xen_domain_type = XEN_PV_DOMAIN;
xen_setup_features();
#ifdef CONFIG_XEN_PVH
xen_pvh_early_guest_init();
#endif
xen_setup_machphys_mapping();
/* Install Xen paravirt ops */
@ -1559,8 +1566,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
#endif
__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
__supported_pte_mask |= _PAGE_IOMAP;
/*
* Prevent page tables from being allocated in highmem, even
* if CONFIG_HIGHPTE is enabled.
@ -1667,10 +1672,16 @@ asmlinkage __visible void __init xen_start_kernel(void)
new_cpu_data.x86_capability[0] = cpuid_edx(1);
#endif
if (xen_start_info->mod_start) {
if (xen_start_info->flags & SIF_MOD_START_PFN)
initrd_start = PFN_PHYS(xen_start_info->mod_start);
else
initrd_start = __pa(xen_start_info->mod_start);
}
/* Poke various useful things into boot_params */
boot_params.hdr.type_of_loader = (9 << 4) | 0;
boot_params.hdr.ramdisk_image = xen_start_info->mod_start
? __pa(xen_start_info->mod_start) : 0;
boot_params.hdr.ramdisk_image = initrd_start;
boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);

View File

@ -399,38 +399,14 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
if (unlikely(mfn == INVALID_P2M_ENTRY)) {
mfn = 0;
flags = 0;
} else {
/*
* Paramount to do this test _after_ the
* INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
* IDENTITY_FRAME_BIT resolves to true.
*/
mfn &= ~FOREIGN_FRAME_BIT;
if (mfn & IDENTITY_FRAME_BIT) {
mfn &= ~IDENTITY_FRAME_BIT;
flags |= _PAGE_IOMAP;
}
}
} else
mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
}
return val;
}
static pteval_t iomap_pte(pteval_t val)
{
if (val & _PAGE_PRESENT) {
unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
pteval_t flags = val & PTE_FLAGS_MASK;
/* We assume the pte frame number is a MFN, so
just use it as-is. */
val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
}
return val;
}
__visible pteval_t xen_pte_val(pte_t pte)
{
pteval_t pteval = pte.pte;
@ -441,9 +417,6 @@ __visible pteval_t xen_pte_val(pte_t pte)
pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
}
#endif
if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
return pteval;
return pte_mfn_to_pfn(pteval);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
@ -481,7 +454,6 @@ void xen_set_pat(u64 pat)
__visible pte_t xen_make_pte(pteval_t pte)
{
phys_addr_t addr = (pte & PTE_PFN_MASK);
#if 0
/* If Linux is trying to set a WC pte, then map to the Xen WC.
* If _PAGE_PAT is set, then it probably means it is really
@ -496,19 +468,7 @@ __visible pte_t xen_make_pte(pteval_t pte)
pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
}
#endif
/*
* Unprivileged domains are allowed to do IOMAPpings for
* PCI passthrough, but not map ISA space. The ISA
* mappings are just dummy local mappings to keep other
* parts of the kernel happy.
*/
if (unlikely(pte & _PAGE_IOMAP) &&
(xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
pte = iomap_pte(pte);
} else {
pte &= ~_PAGE_IOMAP;
pte = pte_pfn_to_mfn(pte);
}
pte = pte_pfn_to_mfn(pte);
return native_make_pte(pte);
}
@ -2091,7 +2051,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
default:
/* By default, set_fixmap is used for hardware mappings */
pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
pte = mfn_pte(phys, prot);
break;
}

View File

@ -173,6 +173,7 @@
#include <xen/balloon.h>
#include <xen/grant_table.h>
#include "p2m.h"
#include "multicalls.h"
#include "xen-ops.h"
@ -180,12 +181,6 @@ static void __init m2p_override_init(void);
unsigned long xen_max_p2m_pfn __read_mostly;
#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
/* Placeholders for holes in the address space */
static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
@ -202,16 +197,12 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE);
RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
/* We might hit two boundary violations at the start and end, at max each
* boundary violation will require three middle nodes. */
RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3);
/* When we populate back during bootup, the amount of pages can vary. The
* max we have is seen is 395979, but that does not mean it can't be more.
* Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle
* it can re-use Xen provided mfn_list array, so we only need to allocate at
* most three P2M top nodes. */
RESERVE_BRK(p2m_populated, PAGE_SIZE * 3);
/* For each I/O range remapped we may lose up to two leaf pages for the boundary
* violations and three mid pages to cover up to 3GB. With
* early_can_reuse_p2m_middle() most of the leaf pages will be reused by the
* remapped region.
*/
RESERVE_BRK(p2m_identity_remap, PAGE_SIZE * 2 * 3 * MAX_REMAP_RANGES);
static inline unsigned p2m_top_index(unsigned long pfn)
{

15
arch/x86/xen/p2m.h Normal file
View File

@ -0,0 +1,15 @@
#ifndef _XEN_P2M_H
#define _XEN_P2M_H
#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
#define MAX_REMAP_RANGES 10
extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e);
#endif /* _XEN_P2M_H */

View File

@ -29,6 +29,7 @@
#include <xen/features.h>
#include "xen-ops.h"
#include "vdso.h"
#include "p2m.h"
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
@ -46,6 +47,9 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;
/* Buffer used to remap identity mapped pages */
unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata;
/*
* The maximum amount of extra memory compared to the base size. The
* main scaling factor is the size of struct page. At extreme ratios
@ -151,107 +155,325 @@ static unsigned long __init xen_do_chunk(unsigned long start,
return len;
}
static unsigned long __init xen_release_chunk(unsigned long start,
unsigned long end)
{
return xen_do_chunk(start, end, true);
}
static unsigned long __init xen_populate_chunk(
/*
* Finds the next RAM pfn available in the E820 map after min_pfn.
* This function updates min_pfn with the pfn found and returns
* the size of that range or zero if not found.
*/
static unsigned long __init xen_find_pfn_range(
const struct e820entry *list, size_t map_size,
unsigned long max_pfn, unsigned long *last_pfn,
unsigned long credits_left)
unsigned long *min_pfn)
{
const struct e820entry *entry;
unsigned int i;
unsigned long done = 0;
unsigned long dest_pfn;
for (i = 0, entry = list; i < map_size; i++, entry++) {
unsigned long s_pfn;
unsigned long e_pfn;
unsigned long pfns;
long capacity;
if (credits_left <= 0)
break;
if (entry->type != E820_RAM)
continue;
e_pfn = PFN_DOWN(entry->addr + entry->size);
/* We only care about E820 after the xen_start_info->nr_pages */
if (e_pfn <= max_pfn)
/* We only care about E820 after this */
if (e_pfn < *min_pfn)
continue;
s_pfn = PFN_UP(entry->addr);
/* If the E820 falls within the nr_pages, we want to start
* at the nr_pages PFN.
* If that would mean going past the E820 entry, skip it
/* If min_pfn falls within the E820 entry, we want to start
* at the min_pfn PFN.
*/
if (s_pfn <= max_pfn) {
capacity = e_pfn - max_pfn;
dest_pfn = max_pfn;
if (s_pfn <= *min_pfn) {
done = e_pfn - *min_pfn;
} else {
capacity = e_pfn - s_pfn;
dest_pfn = s_pfn;
done = e_pfn - s_pfn;
*min_pfn = s_pfn;
}
if (credits_left < capacity)
capacity = credits_left;
pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
done += pfns;
*last_pfn = (dest_pfn + pfns);
if (pfns < capacity)
break;
credits_left -= pfns;
break;
}
return done;
}
static void __init xen_set_identity_and_release_chunk(
unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
unsigned long *released, unsigned long *identity)
/*
* This releases a chunk of memory and then does the identity map. It's used as
* as a fallback if the remapping fails.
*/
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity,
unsigned long *released)
{
unsigned long pfn;
/*
* If the PFNs are currently mapped, clear the mappings
* (except for the ISA region which must be 1:1 mapped) to
* release the refcounts (in Xen) on the original frames.
*/
for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
pte_t pte = __pte_ma(0);
if (pfn < PFN_UP(ISA_END_ADDRESS))
pte = mfn_pte(pfn, PAGE_KERNEL_IO);
(void)HYPERVISOR_update_va_mapping(
(unsigned long)__va(pfn << PAGE_SHIFT), pte, 0);
}
if (start_pfn < nr_pages)
*released += xen_release_chunk(
start_pfn, min(end_pfn, nr_pages));
WARN_ON(start_pfn > end_pfn);
/* Need to release pages first */
*released += xen_do_chunk(start_pfn, min(end_pfn, nr_pages), true);
*identity += set_phys_range_identity(start_pfn, end_pfn);
}
static unsigned long __init xen_set_identity_and_release(
const struct e820entry *list, size_t map_size, unsigned long nr_pages)
/*
* Helper function to update both the p2m and m2p tables.
*/
static unsigned long __init xen_update_mem_tables(unsigned long pfn,
unsigned long mfn)
{
struct mmu_update update = {
.ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
.val = pfn
};
/* Update p2m */
if (!early_set_phys_to_machine(pfn, mfn)) {
WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
pfn, mfn);
return false;
}
/* Update m2p */
if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
mfn, pfn);
return false;
}
return true;
}
/*
* This function updates the p2m and m2p tables with an identity map from
* start_pfn to start_pfn+size and remaps the underlying RAM of the original
* allocation at remap_pfn. It must do so carefully in P2M_PER_PAGE sized blocks
* to not exhaust the reserved brk space. Doing it in properly aligned blocks
* ensures we only allocate the minimum required leaf pages in the p2m table. It
* copies the existing mfns from the p2m table under the 1:1 map, overwrites
* them with the identity map and then updates the p2m and m2p tables with the
* remapped memory.
*/
static unsigned long __init xen_do_set_identity_and_remap_chunk(
unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
{
unsigned long ident_pfn_iter, remap_pfn_iter;
unsigned long ident_start_pfn_align, remap_start_pfn_align;
unsigned long ident_end_pfn_align, remap_end_pfn_align;
unsigned long ident_boundary_pfn, remap_boundary_pfn;
unsigned long ident_cnt = 0;
unsigned long remap_cnt = 0;
unsigned long left = size;
unsigned long mod;
int i;
WARN_ON(size == 0);
BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
/*
* Determine the proper alignment to remap memory in P2M_PER_PAGE sized
* blocks. We need to keep track of both the existing pfn mapping and
* the new pfn remapping.
*/
mod = start_pfn % P2M_PER_PAGE;
ident_start_pfn_align =
mod ? (start_pfn - mod + P2M_PER_PAGE) : start_pfn;
mod = remap_pfn % P2M_PER_PAGE;
remap_start_pfn_align =
mod ? (remap_pfn - mod + P2M_PER_PAGE) : remap_pfn;
mod = (start_pfn + size) % P2M_PER_PAGE;
ident_end_pfn_align = start_pfn + size - mod;
mod = (remap_pfn + size) % P2M_PER_PAGE;
remap_end_pfn_align = remap_pfn + size - mod;
/* Iterate over each p2m leaf node in each range */
for (ident_pfn_iter = ident_start_pfn_align, remap_pfn_iter = remap_start_pfn_align;
ident_pfn_iter < ident_end_pfn_align && remap_pfn_iter < remap_end_pfn_align;
ident_pfn_iter += P2M_PER_PAGE, remap_pfn_iter += P2M_PER_PAGE) {
/* Check we aren't past the end */
BUG_ON(ident_pfn_iter + P2M_PER_PAGE > start_pfn + size);
BUG_ON(remap_pfn_iter + P2M_PER_PAGE > remap_pfn + size);
/* Save p2m mappings */
for (i = 0; i < P2M_PER_PAGE; i++)
xen_remap_buf[i] = pfn_to_mfn(ident_pfn_iter + i);
/* Set identity map which will free a p2m leaf */
ident_cnt += set_phys_range_identity(ident_pfn_iter,
ident_pfn_iter + P2M_PER_PAGE);
#ifdef DEBUG
/* Helps verify a p2m leaf has been freed */
for (i = 0; i < P2M_PER_PAGE; i++) {
unsigned int pfn = ident_pfn_iter + i;
BUG_ON(pfn_to_mfn(pfn) != pfn);
}
#endif
/* Now remap memory */
for (i = 0; i < P2M_PER_PAGE; i++) {
unsigned long mfn = xen_remap_buf[i];
/* This will use the p2m leaf freed above */
if (!xen_update_mem_tables(remap_pfn_iter + i, mfn)) {
WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
remap_pfn_iter + i, mfn);
return 0;
}
remap_cnt++;
}
left -= P2M_PER_PAGE;
}
/* Max boundary space possible */
BUG_ON(left > (P2M_PER_PAGE - 1) * 2);
/* Now handle the boundary conditions */
ident_boundary_pfn = start_pfn;
remap_boundary_pfn = remap_pfn;
for (i = 0; i < left; i++) {
unsigned long mfn;
/* These two checks move from the start to end boundaries */
if (ident_boundary_pfn == ident_start_pfn_align)
ident_boundary_pfn = ident_pfn_iter;
if (remap_boundary_pfn == remap_start_pfn_align)
remap_boundary_pfn = remap_pfn_iter;
/* Check we aren't past the end */
BUG_ON(ident_boundary_pfn >= start_pfn + size);
BUG_ON(remap_boundary_pfn >= remap_pfn + size);
mfn = pfn_to_mfn(ident_boundary_pfn);
if (!xen_update_mem_tables(remap_boundary_pfn, mfn)) {
WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
remap_pfn_iter + i, mfn);
return 0;
}
remap_cnt++;
ident_boundary_pfn++;
remap_boundary_pfn++;
}
/* Finish up the identity map */
if (ident_start_pfn_align >= ident_end_pfn_align) {
/*
* In this case we have an identity range which does not span an
* aligned block so everything needs to be identity mapped here.
* If we didn't check this we might remap too many pages since
* the align boundaries are not meaningful in this case.
*/
ident_cnt += set_phys_range_identity(start_pfn,
start_pfn + size);
} else {
/* Remapped above so check each end of the chunk */
if (start_pfn < ident_start_pfn_align)
ident_cnt += set_phys_range_identity(start_pfn,
ident_start_pfn_align);
if (start_pfn + size > ident_pfn_iter)
ident_cnt += set_phys_range_identity(ident_pfn_iter,
start_pfn + size);
}
BUG_ON(ident_cnt != size);
BUG_ON(remap_cnt != size);
return size;
}
/*
* This function takes a contiguous pfn range that needs to be identity mapped
* and:
*
* 1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
* 2) Calls the do_ function to actually do the mapping/remapping work.
*
* The goal is to not allocate additional memory but to remap the existing
* pages. In the case of an error the underlying memory is simply released back
* to Xen and not remapped.
*/
static unsigned long __init xen_set_identity_and_remap_chunk(
const struct e820entry *list, size_t map_size, unsigned long start_pfn,
unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
unsigned long *identity, unsigned long *remapped,
unsigned long *released)
{
unsigned long pfn;
unsigned long i = 0;
unsigned long n = end_pfn - start_pfn;
while (i < n) {
unsigned long cur_pfn = start_pfn + i;
unsigned long left = n - i;
unsigned long size = left;
unsigned long remap_range_size;
/* Do not remap pages beyond the current allocation */
if (cur_pfn >= nr_pages) {
/* Identity map remaining pages */
*identity += set_phys_range_identity(cur_pfn,
cur_pfn + size);
break;
}
if (cur_pfn + size > nr_pages)
size = nr_pages - cur_pfn;
remap_range_size = xen_find_pfn_range(list, map_size,
&remap_pfn);
if (!remap_range_size) {
pr_warning("Unable to find available pfn range, not remapping identity pages\n");
xen_set_identity_and_release_chunk(cur_pfn,
cur_pfn + left, nr_pages, identity, released);
break;
}
/* Adjust size to fit in current e820 RAM region */
if (size > remap_range_size)
size = remap_range_size;
if (!xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn)) {
WARN(1, "Failed to remap 1:1 memory cur_pfn=%ld size=%ld remap_pfn=%ld\n",
cur_pfn, size, remap_pfn);
xen_set_identity_and_release_chunk(cur_pfn,
cur_pfn + left, nr_pages, identity, released);
break;
}
/* Update variables to reflect new mappings. */
i += size;
remap_pfn += size;
*identity += size;
*remapped += size;
}
/*
* If the PFNs are currently mapped, the VA mapping also needs
* to be updated to be 1:1.
*/
for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
(void)HYPERVISOR_update_va_mapping(
(unsigned long)__va(pfn << PAGE_SHIFT),
mfn_pte(pfn, PAGE_KERNEL_IO), 0);
return remap_pfn;
}
static unsigned long __init xen_set_identity_and_remap(
const struct e820entry *list, size_t map_size, unsigned long nr_pages,
unsigned long *released)
{
phys_addr_t start = 0;
unsigned long released = 0;
unsigned long identity = 0;
unsigned long remapped = 0;
unsigned long last_pfn = nr_pages;
const struct e820entry *entry;
unsigned long num_released = 0;
int i;
/*
* Combine non-RAM regions and gaps until a RAM region (or the
* end of the map) is reached, then set the 1:1 map and
* release the pages (if available) in those non-RAM regions.
* remap the memory in those non-RAM regions.
*
* The combined non-RAM regions are rounded to a whole number
* of pages so any partial pages are accessible via the 1:1
@ -269,22 +491,24 @@ static unsigned long __init xen_set_identity_and_release(
end_pfn = PFN_UP(entry->addr);
if (start_pfn < end_pfn)
xen_set_identity_and_release_chunk(
start_pfn, end_pfn, nr_pages,
&released, &identity);
last_pfn = xen_set_identity_and_remap_chunk(
list, map_size, start_pfn,
end_pfn, nr_pages, last_pfn,
&identity, &remapped,
&num_released);
start = end;
}
}
if (released)
printk(KERN_INFO "Released %lu pages of unused memory\n", released);
if (identity)
printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
*released = num_released;
return released;
pr_info("Set %ld page(s) to 1-1 mapping\n", identity);
pr_info("Remapped %ld page(s), last_pfn=%ld\n", remapped,
last_pfn);
pr_info("Released %ld page(s)\n", num_released);
return last_pfn;
}
static unsigned long __init xen_get_max_pages(void)
{
unsigned long max_pages = MAX_DOMAIN_PAGES;
@ -347,7 +571,6 @@ char * __init xen_memory_setup(void)
unsigned long max_pages;
unsigned long last_pfn = 0;
unsigned long extra_pages = 0;
unsigned long populated;
int i;
int op;
@ -392,20 +615,11 @@ char * __init xen_memory_setup(void)
extra_pages += max_pages - max_pfn;
/*
* Set P2M for all non-RAM pages and E820 gaps to be identity
* type PFNs. Any RAM pages that would be made inaccesible by
* this are first released.
* Set identity map on non-RAM pages and remap the underlying RAM.
*/
xen_released_pages = xen_set_identity_and_release(
map, memmap.nr_entries, max_pfn);
last_pfn = xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
&xen_released_pages);
/*
* Populate back the non-RAM pages and E820 gaps that had been
* released. */
populated = xen_populate_chunk(map, memmap.nr_entries,
max_pfn, &last_pfn, xen_released_pages);
xen_released_pages -= populated;
extra_pages += xen_released_pages;
if (last_pfn > max_pfn) {

View File

@ -37,6 +37,7 @@
#include <xen/hvc-console.h>
#include "xen-ops.h"
#include "mmu.h"
#include "smp.h"
cpumask_var_t xen_cpu_initialized_map;
@ -99,10 +100,14 @@ static void cpu_bringup(void)
wmb(); /* make sure everything is out */
}
/* Note: cpu parameter is only relevant for PVH */
static void cpu_bringup_and_idle(int cpu)
/*
* Note: cpu parameter is only relevant for PVH. The reason for passing it
* is we can't do smp_processor_id until the percpu segments are loaded, for
* which we need the cpu number! So we pass it in rdi as first parameter.
*/
asmlinkage __visible void cpu_bringup_and_idle(int cpu)
{
#ifdef CONFIG_X86_64
#ifdef CONFIG_XEN_PVH
if (xen_feature(XENFEAT_auto_translated_physmap) &&
xen_feature(XENFEAT_supervisor_mode_kernel))
xen_pvh_secondary_vcpu_init(cpu);
@ -374,11 +379,10 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
ctxt->user_regs.fs = __KERNEL_PERCPU;
ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
#endif
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
ctxt->user_regs.ds = __USER_DS;
@ -413,15 +417,18 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
(unsigned long)xen_failsafe_callback;
ctxt->user_regs.cs = __KERNEL_CS;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
#ifdef CONFIG_X86_32
}
#else
} else
/* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with
* %rdi having the cpu number - which means are passing in
* as the first parameter the cpu. Subtle!
#ifdef CONFIG_XEN_PVH
else {
/*
* The vcpu comes on kernel page tables which have the NX pte
* bit set. This means before DS/SS is touched, NX in
* EFER must be set. Hence the following assembly glue code.
*/
ctxt->user_regs.eip = (unsigned long)xen_pvh_early_cpu_init;
ctxt->user_regs.rdi = cpu;
ctxt->user_regs.rsi = true; /* entry == true */
}
#endif
ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));

View File

@ -8,4 +8,12 @@ extern void xen_send_IPI_allbutself(int vector);
extern void xen_send_IPI_all(int vector);
extern void xen_send_IPI_self(int vector);
#ifdef CONFIG_XEN_PVH
extern void xen_pvh_early_cpu_init(int cpu, bool entry);
#else
static inline void xen_pvh_early_cpu_init(int cpu, bool entry)
{
}
#endif
#endif

View File

@ -47,6 +47,41 @@ ENTRY(startup_xen)
__FINIT
#ifdef CONFIG_XEN_PVH
/*
* xen_pvh_early_cpu_init() - early PVH VCPU initialization
* @cpu: this cpu number (%rdi)
* @entry: true if this is a secondary vcpu coming up on this entry
* point, false if this is the boot CPU being initialized for
* the first time (%rsi)
*
* Note: This is called as a function on the boot CPU, and is the entry point
* on the secondary CPU.
*/
ENTRY(xen_pvh_early_cpu_init)
mov %rsi, %r11
/* Gather features to see if NX implemented. */
mov $0x80000001, %eax
cpuid
mov %edx, %esi
mov $MSR_EFER, %ecx
rdmsr
bts $_EFER_SCE, %eax
bt $20, %esi
jnc 1f /* No NX, skip setting it */
bts $_EFER_NX, %eax
1: wrmsr
#ifdef CONFIG_SMP
cmp $0, %r11b
jne cpu_bringup_and_idle
#endif
ret
#endif /* CONFIG_XEN_PVH */
.pushsection .text
.balign PAGE_SIZE
ENTRY(hypercall_page)
@ -124,6 +159,7 @@ NEXT_HYPERCALL(arch_6)
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN, .long 1)
ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)

View File

@ -907,22 +907,17 @@ static int connect_ring(struct backend_info *be)
return 0;
}
/* ** Driver Registration ** */
static const struct xenbus_device_id xen_blkbk_ids[] = {
{ "vbd" },
{ "" }
};
static DEFINE_XENBUS_DRIVER(xen_blkbk, ,
static struct xenbus_driver xen_blkbk_driver = {
.ids = xen_blkbk_ids,
.probe = xen_blkbk_probe,
.remove = xen_blkbk_remove,
.otherend_changed = frontend_changed
);
};
int xen_blkif_xenbus_init(void)
{

View File

@ -2055,13 +2055,14 @@ static const struct xenbus_device_id blkfront_ids[] = {
{ "" }
};
static DEFINE_XENBUS_DRIVER(blkfront, ,
static struct xenbus_driver blkfront_driver = {
.ids = blkfront_ids,
.probe = blkfront_probe,
.remove = blkfront_remove,
.resume = blkfront_resume,
.otherend_changed = blkback_changed,
.is_ready = blkfront_is_ready,
);
};
static int __init xlblk_init(void)
{

View File

@ -367,12 +367,13 @@ static const struct xenbus_device_id tpmfront_ids[] = {
};
MODULE_ALIAS("xen:vtpm");
static DEFINE_XENBUS_DRIVER(tpmfront, ,
.probe = tpmfront_probe,
.remove = tpmfront_remove,
.resume = tpmfront_resume,
.otherend_changed = backend_changed,
);
static struct xenbus_driver tpmfront_driver = {
.ids = tpmfront_ids,
.probe = tpmfront_probe,
.remove = tpmfront_remove,
.resume = tpmfront_resume,
.otherend_changed = backend_changed,
};
static int __init xen_tpmfront_init(void)
{

View File

@ -365,12 +365,13 @@ static const struct xenbus_device_id xenkbd_ids[] = {
{ "" }
};
static DEFINE_XENBUS_DRIVER(xenkbd, ,
static struct xenbus_driver xenkbd_driver = {
.ids = xenkbd_ids,
.probe = xenkbd_probe,
.remove = xenkbd_remove,
.resume = xenkbd_resume,
.otherend_changed = xenkbd_backend_changed,
);
};
static int __init xenkbd_init(void)
{

View File

@ -937,22 +937,18 @@ static int read_xenbus_vif_flags(struct backend_info *be)
return 0;
}
/* ** Driver Registration ** */
static const struct xenbus_device_id netback_ids[] = {
{ "vif" },
{ "" }
};
static DEFINE_XENBUS_DRIVER(netback, ,
static struct xenbus_driver netback_driver = {
.ids = netback_ids,
.probe = netback_probe,
.remove = netback_remove,
.uevent = netback_uevent,
.otherend_changed = frontend_changed,
);
};
int xenvif_xenbus_init(void)
{

View File

@ -2300,12 +2300,6 @@ static void xennet_sysfs_delif(struct net_device *netdev)
#endif /* CONFIG_SYSFS */
static const struct xenbus_device_id netfront_ids[] = {
{ "vif" },
{ "" }
};
static int xennet_remove(struct xenbus_device *dev)
{
struct netfront_info *info = dev_get_drvdata(&dev->dev);
@ -2338,12 +2332,18 @@ static int xennet_remove(struct xenbus_device *dev)
return 0;
}
static DEFINE_XENBUS_DRIVER(netfront, ,
static const struct xenbus_device_id netfront_ids[] = {
{ "vif" },
{ "" }
};
static struct xenbus_driver netfront_driver = {
.ids = netfront_ids,
.probe = netfront_probe,
.remove = xennet_remove,
.resume = netfront_resume,
.otherend_changed = netback_changed,
);
};
static int __init netif_init(void)
{

View File

@ -1136,11 +1136,13 @@ static const struct xenbus_device_id xenpci_ids[] = {
{""},
};
static DEFINE_XENBUS_DRIVER(xenpci, "pcifront",
static struct xenbus_driver xenpci_driver = {
.name = "pcifront",
.ids = xenpci_ids,
.probe = pcifront_xenbus_probe,
.remove = pcifront_xenbus_remove,
.otherend_changed = pcifront_backend_changed,
);
};
static int __init pcifront_init(void)
{

View File

@ -587,6 +587,16 @@ config VMWARE_PVSCSI
To compile this driver as a module, choose M here: the
module will be called vmw_pvscsi.
config XEN_SCSI_FRONTEND
tristate "XEN SCSI frontend driver"
depends on SCSI && XEN
select XEN_XENBUS_FRONTEND
help
The XEN SCSI frontend driver allows the kernel to access SCSI Devices
within another guest OS (usually Dom0).
Only needed if the kernel is running in a XEN guest and generic
SCSI access to a device is needed.
config HYPERV_STORAGE
tristate "Microsoft Hyper-V virtual storage driver"
depends on SCSI && HYPERV

View File

@ -141,6 +141,7 @@ obj-$(CONFIG_SCSI_ESAS2R) += esas2r/
obj-$(CONFIG_SCSI_PMCRAID) += pmcraid.o
obj-$(CONFIG_SCSI_VIRTIO) += virtio_scsi.o
obj-$(CONFIG_VMWARE_PVSCSI) += vmw_pvscsi.o
obj-$(CONFIG_XEN_SCSI_FRONTEND) += xen-scsifront.o
obj-$(CONFIG_HYPERV_STORAGE) += hv_storvsc.o
obj-$(CONFIG_ARM) += arm/

1026
drivers/scsi/xen-scsifront.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -347,8 +347,6 @@ static int xen_console_remove(struct xencons_info *info)
}
#ifdef CONFIG_HVC_XEN_FRONTEND
static struct xenbus_driver xencons_driver;
static int xencons_remove(struct xenbus_device *dev)
{
return xen_console_remove(dev_get_drvdata(&dev->dev));
@ -499,13 +497,14 @@ static const struct xenbus_device_id xencons_ids[] = {
{ "" }
};
static DEFINE_XENBUS_DRIVER(xencons, "xenconsole",
static struct xenbus_driver xencons_driver = {
.name = "xenconsole",
.ids = xencons_ids,
.probe = xencons_probe,
.remove = xencons_remove,
.resume = xencons_resume,
.otherend_changed = xencons_backend_changed,
);
};
#endif /* CONFIG_HVC_XEN_FRONTEND */
static int __init xen_hvc_init(void)

View File

@ -684,12 +684,13 @@ static const struct xenbus_device_id xenfb_ids[] = {
{ "" }
};
static DEFINE_XENBUS_DRIVER(xenfb, ,
static struct xenbus_driver xenfb_driver = {
.ids = xenfb_ids,
.probe = xenfb_probe,
.remove = xenfb_remove,
.resume = xenfb_resume,
.otherend_changed = xenfb_backend_changed,
);
};
static int __init xenfb_init(void)
{

View File

@ -172,6 +172,15 @@ config XEN_PCIDEV_BACKEND
If in doubt, say m.
config XEN_SCSI_BACKEND
tristate "XEN SCSI backend driver"
depends on XEN && XEN_BACKEND && TARGET_CORE
help
The SCSI backend driver allows the kernel to export its SCSI Devices
to other guests via a high-performance shared-memory interface.
Only needed for systems running as XEN driver domains (e.g. Dom0) and
if guests need generic access to SCSI devices.
config XEN_PRIVCMD
tristate
depends on XEN

View File

@ -36,6 +36,7 @@ obj-$(CONFIG_XEN_ACPI_HOTPLUG_MEMORY) += xen-acpi-memhotplug.o
obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU) += xen-acpi-cpuhotplug.o
obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o
obj-$(CONFIG_XEN_EFI) += efi.o
obj-$(CONFIG_XEN_SCSI_BACKEND) += xen-scsiback.o
xen-evtchn-y := evtchn.o
xen-gntdev-y := gntdev.o
xen-gntalloc-y := gntalloc.o

View File

@ -27,6 +27,8 @@
#include <xen/interface/platform.h>
#include <xen/xen.h>
#include <asm/page.h>
#include <asm/xen/hypercall.h>
#define INIT_EFI_OP(name) \

View File

@ -900,8 +900,8 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu)
return irq;
}
static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
unsigned int remote_port)
int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
unsigned int remote_port)
{
struct evtchn_bind_interdomain bind_interdomain;
int err;
@ -914,6 +914,7 @@ static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
return err ? : bind_evtchn_to_irq(bind_interdomain.local_port);
}
EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq);
static int find_virq(unsigned int virq, unsigned int cpu)
{

View File

@ -592,7 +592,7 @@ static int grow_gnttab_list(unsigned int more_frames)
return 0;
grow_nomem:
for ( ; i >= nr_glist_frames; i--)
while (i-- > nr_glist_frames)
free_page((unsigned long) gnttab_list[i]);
return -ENOMEM;
}

View File

@ -719,11 +719,13 @@ static const struct xenbus_device_id xen_pcibk_ids[] = {
{""},
};
static DEFINE_XENBUS_DRIVER(xen_pcibk, DRV_NAME,
static struct xenbus_driver xen_pcibk_driver = {
.name = DRV_NAME,
.ids = xen_pcibk_ids,
.probe = xen_pcibk_xenbus_probe,
.remove = xen_pcibk_xenbus_remove,
.otherend_changed = xen_pcibk_frontend_changed,
);
};
const struct xen_pcibk_backend *__read_mostly xen_pcibk_backend;

2126
drivers/xen/xen-scsiback.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -259,7 +259,6 @@ static char *error_path(struct xenbus_device *dev)
static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
const char *fmt, va_list ap)
{
int ret;
unsigned int len;
char *printf_buffer = NULL;
char *path_buffer = NULL;
@ -270,9 +269,7 @@ static void xenbus_va_dev_error(struct xenbus_device *dev, int err,
goto fail;
len = sprintf(printf_buffer, "%i ", -err);
ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1);
vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
dev_err(&dev->dev, "%s\n", printf_buffer);
@ -361,8 +358,8 @@ static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err,
* @ring_mfn: mfn of ring to grant
* Grant access to the given @ring_mfn to the peer of the given device. Return
* 0 on success, or -errno on error. On error, the device will switch to
* XenbusStateClosing, and the error will be saved in the store.
* a grant reference on success, or -errno on error. On error, the device will
* switch to XenbusStateClosing, and the error will be saved in the store.
*/
int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
{

View File

@ -297,9 +297,13 @@ void xenbus_dev_shutdown(struct device *_dev)
EXPORT_SYMBOL_GPL(xenbus_dev_shutdown);
int xenbus_register_driver_common(struct xenbus_driver *drv,
struct xen_bus_type *bus)
struct xen_bus_type *bus,
struct module *owner, const char *mod_name)
{
drv->driver.name = drv->name ? drv->name : drv->ids[0].devicetype;
drv->driver.bus = &bus->bus;
drv->driver.owner = owner;
drv->driver.mod_name = mod_name;
return driver_register(&drv->driver);
}

View File

@ -60,7 +60,9 @@ extern int xenbus_match(struct device *_dev, struct device_driver *_drv);
extern int xenbus_dev_probe(struct device *_dev);
extern int xenbus_dev_remove(struct device *_dev);
extern int xenbus_register_driver_common(struct xenbus_driver *drv,
struct xen_bus_type *bus);
struct xen_bus_type *bus,
struct module *owner,
const char *mod_name);
extern int xenbus_probe_node(struct xen_bus_type *bus,
const char *type,
const char *nodename);

View File

@ -234,13 +234,15 @@ int xenbus_dev_is_online(struct xenbus_device *dev)
}
EXPORT_SYMBOL_GPL(xenbus_dev_is_online);
int xenbus_register_backend(struct xenbus_driver *drv)
int __xenbus_register_backend(struct xenbus_driver *drv, struct module *owner,
const char *mod_name)
{
drv->read_otherend_details = read_frontend_details;
return xenbus_register_driver_common(drv, &xenbus_backend);
return xenbus_register_driver_common(drv, &xenbus_backend,
owner, mod_name);
}
EXPORT_SYMBOL_GPL(xenbus_register_backend);
EXPORT_SYMBOL_GPL(__xenbus_register_backend);
static int backend_probe_and_watch(struct notifier_block *notifier,
unsigned long event,

View File

@ -317,13 +317,15 @@ static void wait_for_devices(struct xenbus_driver *xendrv)
print_device_status);
}
int xenbus_register_frontend(struct xenbus_driver *drv)
int __xenbus_register_frontend(struct xenbus_driver *drv, struct module *owner,
const char *mod_name)
{
int ret;
drv->read_otherend_details = read_backend_details;
ret = xenbus_register_driver_common(drv, &xenbus_frontend);
ret = xenbus_register_driver_common(drv, &xenbus_frontend,
owner, mod_name);
if (ret)
return ret;
@ -332,7 +334,7 @@ int xenbus_register_frontend(struct xenbus_driver *drv)
return 0;
}
EXPORT_SYMBOL_GPL(xenbus_register_frontend);
EXPORT_SYMBOL_GPL(__xenbus_register_frontend);
static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq);
static int backend_state;

View File

@ -28,6 +28,8 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi,
unsigned long irqflags,
const char *devname,
void *dev_id);
int bind_interdomain_evtchn_to_irq(unsigned int remote_domain,
unsigned int remote_port);
int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain,
unsigned int remote_port,
irq_handler_t handler,

View File

@ -3,6 +3,24 @@
*
* Definitions used for the Xen ELF notes.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
* Copyright (c) 2006, Ian Campbell, XenSource Ltd.
*/
@ -18,12 +36,13 @@
*
* LEGACY indicated the fields in the legacy __xen_guest string which
* this a note type replaces.
*
* String values (for non-legacy) are NULL terminated ASCII, also known
* as ASCIZ type.
*/
/*
* NAME=VALUE pair (string).
*
* LEGACY: FEATURES and PAE
*/
#define XEN_ELFNOTE_INFO 0
@ -137,9 +156,29 @@
/*
* Whether or not the guest supports cooperative suspend cancellation.
* This is a numeric value.
*
* Default is 0
*/
#define XEN_ELFNOTE_SUSPEND_CANCEL 14
/*
* The (non-default) location the initial phys-to-machine map should be
* placed at by the hypervisor (Dom0) or the tools (DomU).
* The kernel must be prepared for this mapping to be established using
* large pages, despite such otherwise not being available to guests.
* The kernel must also be able to handle the page table pages used for
* this mapping not being accessible through the initial mapping.
* (Only x86-64 supports this at present.)
*/
#define XEN_ELFNOTE_INIT_P2M 15
/*
* Whether or not the guest can deal with being passed an initrd not
* mapped through its initial page tables.
*/
#define XEN_ELFNOTE_MOD_START_PFN 16
/*
* The features supported by this kernel (numeric).
*
@ -153,6 +192,11 @@
*/
#define XEN_ELFNOTE_SUPPORTED_FEATURES 17
/*
* The number of the highest elfnote defined.
*/
#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUPPORTED_FEATURES
#endif /* __XEN_PUBLIC_ELFNOTE_H__ */
/*

View File

@ -0,0 +1,229 @@
/******************************************************************************
* vscsiif.h
*
* Based on the blkif.h code.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
* Copyright(c) FUJITSU Limited 2008.
*/
#ifndef __XEN__PUBLIC_IO_SCSI_H__
#define __XEN__PUBLIC_IO_SCSI_H__
#include "ring.h"
#include "../grant_table.h"
/*
* Feature and Parameter Negotiation
* =================================
* The two halves of a Xen pvSCSI driver utilize nodes within the XenStore to
* communicate capabilities and to negotiate operating parameters. This
* section enumerates these nodes which reside in the respective front and
* backend portions of the XenStore, following the XenBus convention.
*
* Any specified default value is in effect if the corresponding XenBus node
* is not present in the XenStore.
*
* XenStore nodes in sections marked "PRIVATE" are solely for use by the
* driver side whose XenBus tree contains them.
*
*****************************************************************************
* Backend XenBus Nodes
*****************************************************************************
*
*------------------ Backend Device Identification (PRIVATE) ------------------
*
* p-devname
* Values: string
*
* A free string used to identify the physical device (e.g. a disk name).
*
* p-dev
* Values: string
*
* A string specifying the backend device: either a 4-tuple "h:c:t:l"
* (host, controller, target, lun, all integers), or a WWN (e.g.
* "naa.60014054ac780582").
*
* v-dev
* Values: string
*
* A string specifying the frontend device in form of a 4-tuple "h:c:t:l"
* (host, controller, target, lun, all integers).
*
*--------------------------------- Features ---------------------------------
*
* feature-sg-grant
* Values: unsigned [VSCSIIF_SG_TABLESIZE...65535]
* Default Value: 0
*
* Specifies the maximum number of scatter/gather elements in grant pages
* supported. If not set, the backend supports up to VSCSIIF_SG_TABLESIZE
* SG elements specified directly in the request.
*
*****************************************************************************
* Frontend XenBus Nodes
*****************************************************************************
*
*----------------------- Request Transport Parameters -----------------------
*
* event-channel
* Values: unsigned
*
* The identifier of the Xen event channel used to signal activity
* in the ring buffer.
*
* ring-ref
* Values: unsigned
*
* The Xen grant reference granting permission for the backend to map
* the sole page in a single page sized ring buffer.
*
* protocol
* Values: string (XEN_IO_PROTO_ABI_*)
* Default Value: XEN_IO_PROTO_ABI_NATIVE
*
* The machine ABI rules governing the format of all ring request and
* response structures.
*/
/* Requests from the frontend to the backend */
/*
* Request a SCSI operation specified via a CDB in vscsiif_request.cmnd.
* The target is specified via channel, id and lun.
*
* The operation to be performed is specified via a CDB in cmnd[], the length
* of the CDB is in cmd_len. sc_data_direction specifies the direction of data
* (to the device, from the device, or none at all).
*
* If data is to be transferred to or from the device the buffer(s) in the
* guest memory is/are specified via one or multiple scsiif_request_segment
* descriptors each specifying a memory page via a grant_ref_t, a offset into
* the page and the length of the area in that page. All scsiif_request_segment
* areas concatenated form the resulting data buffer used by the operation.
* If the number of scsiif_request_segment areas is not too large (less than
* or equal VSCSIIF_SG_TABLESIZE) the areas can be specified directly in the
* seg[] array and the number of valid scsiif_request_segment elements is to be
* set in nr_segments.
*
* If "feature-sg-grant" in the Xenstore is set it is possible to specify more
* than VSCSIIF_SG_TABLESIZE scsiif_request_segment elements via indirection.
* The maximum number of allowed scsiif_request_segment elements is the value
* of the "feature-sg-grant" entry from Xenstore. When using indirection the
* seg[] array doesn't contain specifications of the data buffers, but
* references to scsiif_request_segment arrays, which in turn reference the
* data buffers. While nr_segments holds the number of populated seg[] entries
* (plus the set VSCSIIF_SG_GRANT bit), the number of scsiif_request_segment
* elements referencing the target data buffers is calculated from the lengths
* of the seg[] elements (the sum of all valid seg[].length divided by the
* size of one scsiif_request_segment structure).
*/
#define VSCSIIF_ACT_SCSI_CDB 1
/*
* Request abort of a running operation for the specified target given by
* channel, id, lun and the operation's rqid in ref_rqid.
*/
#define VSCSIIF_ACT_SCSI_ABORT 2
/*
* Request a device reset of the specified target (channel and id).
*/
#define VSCSIIF_ACT_SCSI_RESET 3
/*
* Preset scatter/gather elements for a following request. Deprecated.
* Keeping the define only to avoid usage of the value "4" for other actions.
*/
#define VSCSIIF_ACT_SCSI_SG_PRESET 4
/*
* Maximum scatter/gather segments per request.
*
* Considering balance between allocating at least 16 "vscsiif_request"
* structures on one page (4096 bytes) and the number of scatter/gather
* elements needed, we decided to use 26 as a magic number.
*
* If "feature-sg-grant" is set, more scatter/gather elements can be specified
* by placing them in one or more (up to VSCSIIF_SG_TABLESIZE) granted pages.
* In this case the vscsiif_request seg elements don't contain references to
* the user data, but to the SG elements referencing the user data.
*/
#define VSCSIIF_SG_TABLESIZE 26
/*
* based on Linux kernel 2.6.18, still valid
* Changing these values requires support of multiple protocols via the rings
* as "old clients" will blindly use these values and the resulting structure
* sizes.
*/
#define VSCSIIF_MAX_COMMAND_SIZE 16
#define VSCSIIF_SENSE_BUFFERSIZE 96
struct scsiif_request_segment {
grant_ref_t gref;
uint16_t offset;
uint16_t length;
};
#define VSCSIIF_SG_PER_PAGE (PAGE_SIZE / sizeof(struct scsiif_request_segment))
/* Size of one request is 252 bytes */
struct vscsiif_request {
uint16_t rqid; /* private guest value, echoed in resp */
uint8_t act; /* command between backend and frontend */
uint8_t cmd_len; /* valid CDB bytes */
uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; /* the CDB */
uint16_t timeout_per_command; /* deprecated */
uint16_t channel, id, lun; /* (virtual) device specification */
uint16_t ref_rqid; /* command abort reference */
uint8_t sc_data_direction; /* for DMA_TO_DEVICE(1)
DMA_FROM_DEVICE(2)
DMA_NONE(3) requests */
uint8_t nr_segments; /* Number of pieces of scatter-gather */
/*
* flag in nr_segments: SG elements via grant page
*
* If VSCSIIF_SG_GRANT is set, the low 7 bits of nr_segments specify the number
* of grant pages containing SG elements. Usable if "feature-sg-grant" set.
*/
#define VSCSIIF_SG_GRANT 0x80
struct scsiif_request_segment seg[VSCSIIF_SG_TABLESIZE];
uint32_t reserved[3];
};
/* Size of one response is 252 bytes */
struct vscsiif_response {
uint16_t rqid; /* identifies request */
uint8_t padding;
uint8_t sense_len;
uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE];
int32_t rslt;
uint32_t residual_len; /* request bufflen -
return the value from physical device */
uint32_t reserved[36];
};
DEFINE_RING_TYPES(vscsiif, struct vscsiif_request, struct vscsiif_response);
#endif /*__XEN__PUBLIC_IO_SCSI_H__*/

View File

@ -3,6 +3,24 @@
*
* Guest OS interface to Xen.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*
* Copyright (c) 2004, K A Fraser
*/
@ -73,13 +91,23 @@
* VIRTUAL INTERRUPTS
*
* Virtual interrupts that a guest OS may receive from Xen.
* In the side comments, 'V.' denotes a per-VCPU VIRQ while 'G.' denotes a
* global VIRQ. The former can be bound once per VCPU and cannot be re-bound.
* The latter can be allocated only once per guest: they must initially be
* allocated to VCPU0 but can subsequently be re-bound.
*/
#define VIRQ_TIMER 0 /* Timebase update, and/or requested timeout. */
#define VIRQ_DEBUG 1 /* Request guest to dump debug info. */
#define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */
#define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */
#define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */
#define VIRQ_PCPU_STATE 9 /* (DOM0) PCPU state changed */
#define VIRQ_TIMER 0 /* V. Timebase update, and/or requested timeout. */
#define VIRQ_DEBUG 1 /* V. Request guest to dump debug info. */
#define VIRQ_CONSOLE 2 /* G. (DOM0) Bytes received on emergency console. */
#define VIRQ_DOM_EXC 3 /* G. (DOM0) Exceptional event for some domain. */
#define VIRQ_TBUF 4 /* G. (DOM0) Trace buffer has records available. */
#define VIRQ_DEBUGGER 6 /* G. (DOM0) A domain has paused for debugging. */
#define VIRQ_XENOPROF 7 /* V. XenOprofile interrupt: new sample available */
#define VIRQ_CON_RING 8 /* G. (DOM0) Bytes received on console */
#define VIRQ_PCPU_STATE 9 /* G. (DOM0) PCPU state changed */
#define VIRQ_MEM_EVENT 10 /* G. (DOM0) A memory event has occured */
#define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient */
#define VIRQ_ENOMEM 12 /* G. (DOM0) Low on heap memory */
/* Architecture-specific VIRQ definitions. */
#define VIRQ_ARCH_0 16
@ -92,24 +120,68 @@
#define VIRQ_ARCH_7 23
#define NR_VIRQS 24
/*
* MMU-UPDATE REQUESTS
*
* HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs.
* A foreigndom (FD) can be specified (or DOMID_SELF for none).
* Where the FD has some effect, it is described below.
* ptr[1:0] specifies the appropriate MMU_* command.
* enum neg_errnoval HYPERVISOR_mmu_update(const struct mmu_update reqs[],
* unsigned count, unsigned *done_out,
* unsigned foreigndom)
* @reqs is an array of mmu_update_t structures ((ptr, val) pairs).
* @count is the length of the above array.
* @pdone is an output parameter indicating number of completed operations
* @foreigndom[15:0]: FD, the expected owner of data pages referenced in this
* hypercall invocation. Can be DOMID_SELF.
* @foreigndom[31:16]: PFD, the expected owner of pagetable pages referenced
* in this hypercall invocation. The value of this field
* (x) encodes the PFD as follows:
* x == 0 => PFD == DOMID_SELF
* x != 0 => PFD == x - 1
*
* Sub-commands: ptr[1:0] specifies the appropriate MMU_* command.
* -------------
* ptr[1:0] == MMU_NORMAL_PT_UPDATE:
* Updates an entry in a page table. If updating an L1 table, and the new
* table entry is valid/present, the mapped frame must belong to the FD, if
* an FD has been specified. If attempting to map an I/O page then the
* caller assumes the privilege of the FD.
* Updates an entry in a page table belonging to PFD. If updating an L1 table,
* and the new table entry is valid/present, the mapped frame must belong to
* FD. If attempting to map an I/O page then the caller assumes the privilege
* of the FD.
* FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller.
* FD == DOMID_XEN: Map restricted areas of Xen's heap space.
* ptr[:2] -- Machine address of the page-table entry to modify.
* val -- Value to write.
*
* There also certain implicit requirements when using this hypercall. The
* pages that make up a pagetable must be mapped read-only in the guest.
* This prevents uncontrolled guest updates to the pagetable. Xen strictly
* enforces this, and will disallow any pagetable update which will end up
* mapping pagetable page RW, and will disallow using any writable page as a
* pagetable. In practice it means that when constructing a page table for a
* process, thread, etc, we MUST be very dilligient in following these rules:
* 1). Start with top-level page (PGD or in Xen language: L4). Fill out
* the entries.
* 2). Keep on going, filling out the upper (PUD or L3), and middle (PMD
* or L2).
* 3). Start filling out the PTE table (L1) with the PTE entries. Once
* done, make sure to set each of those entries to RO (so writeable bit
* is unset). Once that has been completed, set the PMD (L2) for this
* PTE table as RO.
* 4). When completed with all of the PMD (L2) entries, and all of them have
* been set to RO, make sure to set RO the PUD (L3). Do the same
* operation on PGD (L4) pagetable entries that have a PUD (L3) entry.
* 5). Now before you can use those pages (so setting the cr3), you MUST also
* pin them so that the hypervisor can verify the entries. This is done
* via the HYPERVISOR_mmuext_op(MMUEXT_PIN_L4_TABLE, guest physical frame
* number of the PGD (L4)). And this point the HYPERVISOR_mmuext_op(
* MMUEXT_NEW_BASEPTR, guest physical frame number of the PGD (L4)) can be
* issued.
* For 32-bit guests, the L4 is not used (as there is less pagetables), so
* instead use L3.
* At this point the pagetables can be modified using the MMU_NORMAL_PT_UPDATE
* hypercall. Also if so desired the OS can also try to write to the PTE
* and be trapped by the hypervisor (as the PTE entry is RO).
*
* To deallocate the pages, the operations are the reverse of the steps
* mentioned above. The argument is MMUEXT_UNPIN_TABLE for all levels and the
* pagetable MUST not be in use (meaning that the cr3 is not set to it).
*
* ptr[1:0] == MMU_MACHPHYS_UPDATE:
* Updates an entry in the machine->pseudo-physical mapping table.
* ptr[:2] -- Machine address within the frame whose mapping to modify.
@ -119,6 +191,72 @@
* ptr[1:0] == MMU_PT_UPDATE_PRESERVE_AD:
* As MMU_NORMAL_PT_UPDATE above, but A/D bits currently in the PTE are ORed
* with those in @val.
*
* @val is usually the machine frame number along with some attributes.
* The attributes by default follow the architecture defined bits. Meaning that
* if this is a X86_64 machine and four page table layout is used, the layout
* of val is:
* - 63 if set means No execute (NX)
* - 46-13 the machine frame number
* - 12 available for guest
* - 11 available for guest
* - 10 available for guest
* - 9 available for guest
* - 8 global
* - 7 PAT (PSE is disabled, must use hypercall to make 4MB or 2MB pages)
* - 6 dirty
* - 5 accessed
* - 4 page cached disabled
* - 3 page write through
* - 2 userspace accessible
* - 1 writeable
* - 0 present
*
* The one bits that does not fit with the default layout is the PAGE_PSE
* also called PAGE_PAT). The MMUEXT_[UN]MARK_SUPER arguments to the
* HYPERVISOR_mmuext_op serve as mechanism to set a pagetable to be 4MB
* (or 2MB) instead of using the PAGE_PSE bit.
*
* The reason that the PAGE_PSE (bit 7) is not being utilized is due to Xen
* using it as the Page Attribute Table (PAT) bit - for details on it please
* refer to Intel SDM 10.12. The PAT allows to set the caching attributes of
* pages instead of using MTRRs.
*
* The PAT MSR is as follows (it is a 64-bit value, each entry is 8 bits):
* PAT4 PAT0
* +-----+-----+----+----+----+-----+----+----+
* | UC | UC- | WC | WB | UC | UC- | WC | WB | <= Linux
* +-----+-----+----+----+----+-----+----+----+
* | UC | UC- | WT | WB | UC | UC- | WT | WB | <= BIOS (default when machine boots)
* +-----+-----+----+----+----+-----+----+----+
* | rsv | rsv | WP | WC | UC | UC- | WT | WB | <= Xen
* +-----+-----+----+----+----+-----+----+----+
*
* The lookup of this index table translates to looking up
* Bit 7, Bit 4, and Bit 3 of val entry:
*
* PAT/PSE (bit 7) ... PCD (bit 4) .. PWT (bit 3).
*
* If all bits are off, then we are using PAT0. If bit 3 turned on,
* then we are using PAT1, if bit 3 and bit 4, then PAT2..
*
* As you can see, the Linux PAT1 translates to PAT4 under Xen. Which means
* that if a guest that follows Linux's PAT setup and would like to set Write
* Combined on pages it MUST use PAT4 entry. Meaning that Bit 7 (PAGE_PAT) is
* set. For example, under Linux it only uses PAT0, PAT1, and PAT2 for the
* caching as:
*
* WB = none (so PAT0)
* WC = PWT (bit 3 on)
* UC = PWT | PCD (bit 3 and 4 are on).
*
* To make it work with Xen, it needs to translate the WC bit as so:
*
* PWT (so bit 3 on) --> PAT (so bit 7 is on) and clear bit 3
*
* And to translate back it would:
*
* PAT (bit 7 on) --> PWT (bit 3 on) and clear bit 7.
*/
#define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */
#define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */
@ -127,7 +265,12 @@
/*
* MMU EXTENDED OPERATIONS
*
* HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
* enum neg_errnoval HYPERVISOR_mmuext_op(mmuext_op_t uops[],
* unsigned int count,
* unsigned int *pdone,
* unsigned int foreigndom)
*/
/* HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures.
* A foreigndom (FD) can be specified (or DOMID_SELF for none).
* Where the FD has some effect, it is described below.
*
@ -164,9 +307,23 @@
* cmd: MMUEXT_FLUSH_CACHE
* No additional arguments. Writes back and flushes cache contents.
*
* cmd: MMUEXT_FLUSH_CACHE_GLOBAL
* No additional arguments. Writes back and flushes cache contents
* on all CPUs in the system.
*
* cmd: MMUEXT_SET_LDT
* linear_addr: Linear address of LDT base (NB. must be page-aligned).
* nr_ents: Number of entries in LDT.
*
* cmd: MMUEXT_CLEAR_PAGE
* mfn: Machine frame number to be cleared.
*
* cmd: MMUEXT_COPY_PAGE
* mfn: Machine frame number of the destination page.
* src_mfn: Machine frame number of the source page.
*
* cmd: MMUEXT_[UN]MARK_SUPER
* mfn: Machine frame number of head of superpage to be [un]marked.
*/
#define MMUEXT_PIN_L1_TABLE 0
#define MMUEXT_PIN_L2_TABLE 1
@ -183,12 +340,18 @@
#define MMUEXT_FLUSH_CACHE 12
#define MMUEXT_SET_LDT 13
#define MMUEXT_NEW_USER_BASEPTR 15
#define MMUEXT_CLEAR_PAGE 16
#define MMUEXT_COPY_PAGE 17
#define MMUEXT_FLUSH_CACHE_GLOBAL 18
#define MMUEXT_MARK_SUPER 19
#define MMUEXT_UNMARK_SUPER 20
#ifndef __ASSEMBLY__
struct mmuext_op {
unsigned int cmd;
union {
/* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */
/* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR
* CLEAR_PAGE, COPY_PAGE, [UN]MARK_SUPER */
xen_pfn_t mfn;
/* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */
unsigned long linear_addr;
@ -198,6 +361,8 @@ struct mmuext_op {
unsigned int nr_ents;
/* TLB_FLUSH_MULTI, INVLPG_MULTI */
void *vcpumask;
/* COPY_PAGE */
xen_pfn_t src_mfn;
} arg2;
};
DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
@ -225,10 +390,23 @@ DEFINE_GUEST_HANDLE_STRUCT(mmuext_op);
*/
#define VMASST_CMD_enable 0
#define VMASST_CMD_disable 1
/* x86/32 guests: simulate full 4GB segment limits. */
#define VMASST_TYPE_4gb_segments 0
/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */
#define VMASST_TYPE_4gb_segments_notify 1
/*
* x86 guests: support writes to bottom-level PTEs.
* NB1. Page-directory entries cannot be written.
* NB2. Guest must continue to remove all writable mappings of PTEs.
*/
#define VMASST_TYPE_writable_pagetables 2
/* x86/PAE guests: support PDPTs above 4GB. */
#define VMASST_TYPE_pae_extended_cr3 3
#define MAX_VMASST_TYPE 3
#ifndef __ASSEMBLY__
@ -260,6 +438,15 @@ typedef uint16_t domid_t;
*/
#define DOMID_XEN (0x7FF2U)
/* DOMID_COW is used as the owner of sharable pages */
#define DOMID_COW (0x7FF3U)
/* DOMID_INVALID is used to identify pages with unknown owner. */
#define DOMID_INVALID (0x7FF4U)
/* Idle domain. */
#define DOMID_IDLE (0x7FFFU)
/*
* Send an array of these to HYPERVISOR_mmu_update().
* NB. The fields are natural pointer/address size for this architecture.
@ -272,7 +459,9 @@ DEFINE_GUEST_HANDLE_STRUCT(mmu_update);
/*
* Send an array of these to HYPERVISOR_multicall().
* NB. The fields are natural register size for this architecture.
* NB. The fields are logically the natural register size for this
* architecture. In cases where xen_ulong_t is larger than this then
* any unused bits in the upper portion must be zero.
*/
struct multicall_entry {
xen_ulong_t op;
@ -442,8 +631,48 @@ struct start_info {
unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */
unsigned long mod_len; /* Size (bytes) of pre-loaded module. */
int8_t cmd_line[MAX_GUEST_CMDLINE];
/* The pfn range here covers both page table and p->m table frames. */
unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table. */
unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table. */
};
/* These flags are passed in the 'flags' field of start_info_t. */
#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */
#define SIF_MOD_START_PFN (1<<3) /* Is mod_start a PFN? */
#define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */
/*
* A multiboot module is a package containing modules very similar to a
* multiboot module array. The only differences are:
* - the array of module descriptors is by convention simply at the beginning
* of the multiboot module,
* - addresses in the module descriptors are based on the beginning of the
* multiboot module,
* - the number of modules is determined by a termination descriptor that has
* mod_start == 0.
*
* This permits to both build it statically and reference it in a configuration
* file, and let the PV guest easily rebase the addresses to virtual addresses
* and at the same time count the number of modules.
*/
struct xen_multiboot_mod_list {
/* Address of first byte of the module */
uint32_t mod_start;
/* Address of last byte of the module (inclusive) */
uint32_t mod_end;
/* Address of zero-terminated command line */
uint32_t cmdline;
/* Unused, must be zero */
uint32_t pad;
};
/*
* The console structure in start_info.console.dom0
*
* This structure includes a variety of information required to
* have a working VGA/VESA console.
*/
struct dom0_vga_console_info {
uint8_t video_type;
#define XEN_VGATYPE_TEXT_MODE_3 0x03
@ -484,11 +713,6 @@ struct dom0_vga_console_info {
} u;
};
/* These flags are passed in the 'flags' field of start_info_t. */
#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */
#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */
#define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */
typedef uint64_t cpumap_t;
typedef uint8_t xen_domain_handle_t[16];

View File

@ -86,6 +86,7 @@ struct xenbus_device_id
/* A xenbus driver. */
struct xenbus_driver {
const char *name; /* defaults to ids[0].devicetype */
const struct xenbus_device_id *ids;
int (*probe)(struct xenbus_device *dev,
const struct xenbus_device_id *id);
@ -100,20 +101,22 @@ struct xenbus_driver {
int (*is_ready)(struct xenbus_device *dev);
};
#define DEFINE_XENBUS_DRIVER(var, drvname, methods...) \
struct xenbus_driver var ## _driver = { \
.driver.name = drvname + 0 ?: var ## _ids->devicetype, \
.driver.owner = THIS_MODULE, \
.ids = var ## _ids, ## methods \
}
static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv)
{
return container_of(drv, struct xenbus_driver, driver);
}
int __must_check xenbus_register_frontend(struct xenbus_driver *);
int __must_check xenbus_register_backend(struct xenbus_driver *);
int __must_check __xenbus_register_frontend(struct xenbus_driver *drv,
struct module *owner,
const char *mod_name);
int __must_check __xenbus_register_backend(struct xenbus_driver *drv,
struct module *owner,
const char *mod_name);
#define xenbus_register_frontend(drv) \
__xenbus_register_frontend(drv, THIS_MODULE, KBUILD_MODNAME);
#define xenbus_register_backend(drv) \
__xenbus_register_backend(drv, THIS_MODULE, KBUILD_MODNAME);
void xenbus_unregister_driver(struct xenbus_driver *drv);