Commit c742b31c03f37c5c499178f09f57381aa6c70131

Authored by Martin Schwidefsky
1 parent 9cfb9b3c3a

[PATCH] fast vdso implementation for CLOCK_THREAD_CPUTIME_ID

The extract cpu time (ectg) instruction allows a user space process to
get the current thread cputime without calling into the kernel. The
code that uses the instruction needs to switch to access-register mode
to reach the per-cpu info page that contains the two base values needed
to calculate the current cputime from the CPU timer with the ectg
instruction.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
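[Editor's note: a usage illustration, not part of the patch. Once the C
library routes clock_gettime through the vdso, a thread can read its own
cputime with no system call on the fast path. A minimal sketch:

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec ts;

            /* With this patch the vdso satisfies this call via ectg,
             * without entering the kernel, when the facility is there. */
            if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts))
                    return 1;
            printf("thread cputime: %lld.%09ld\n",
                   (long long) ts.tv_sec, ts.tv_nsec);
            return 0;
    }
]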

Showing 10 changed files with 222 additions and 32 deletions

arch/s390/include/asm/lowcore.h
... ... @@ -106,8 +106,10 @@
106 106 #define __LC_IPLDEV 0xDB8
107 107 #define __LC_CURRENT 0xDD8
108 108 #define __LC_INT_CLOCK 0xDE8
  109 +#define __LC_VDSO_PER_CPU 0xE38
109 110 #endif /* __s390x__ */
110 111  
  112 +#define __LC_PASTE 0xE40
111 113  
112 114 #define __LC_PANIC_MAGIC 0xE00
113 115 #ifndef __s390x__
... ... @@ -381,7 +383,12 @@
381 383 /* whether the kernel died with panic() or not */
382 384 __u32 panic_magic; /* 0xe00 */
383 385  
384   - __u8 pad13[0x11b8-0xe04]; /* 0xe04 */
  386 + /* Per cpu primary space access list */
  387 + __u8 pad_0xe04[0xe3c-0xe04]; /* 0xe04 */
  388 + __u32 vdso_per_cpu_data; /* 0xe3c */
  389 + __u32 paste[16]; /* 0xe40 */
  390 +
  391 + __u8 pad13[0x11b8-0xe80]; /* 0xe80 */
385 392  
386 393 /* 64 bit extparam used for pfault, diag 250 etc */
387 394 __u64 ext_params2; /* 0x11B8 */
arch/s390/include/asm/vdso.h
... ... @@ -12,9 +12,9 @@
12 12 #ifndef __ASSEMBLY__
13 13  
14 14 /*
15   - * Note about this structure:
  15 + * Note about the vdso_data and vdso_per_cpu_data structures:
16 16 *
17   - * NEVER USE THIS IN USERSPACE CODE DIRECTLY. The layout of this
  17 + * NEVER USE THEM IN USERSPACE CODE DIRECTLY. The layout of the
18 18 * structure is supposed to be known only to the function in the vdso
19 19 * itself and may change without notice.
20 20 */
... ... @@ -28,9 +28,20 @@
28 28 __u64 wtom_clock_nsec; /* 0x28 */
29 29 __u32 tz_minuteswest; /* Minutes west of Greenwich 0x30 */
30 30 __u32 tz_dsttime; /* Type of dst correction 0x34 */
  31 + __u32 ectg_available;
31 32 };
32 33  
  34 +struct vdso_per_cpu_data {
  35 + __u64 ectg_timer_base;
  36 + __u64 ectg_user_time;
  37 +};
  38 +
33 39 extern struct vdso_data *vdso_data;
  40 +
  41 +#ifdef CONFIG_64BIT
  42 +int vdso_alloc_per_cpu(int cpu, struct _lowcore *lowcore);
  43 +void vdso_free_per_cpu(int cpu, struct _lowcore *lowcore);
  44 +#endif
34 45  
35 46 #endif /* __ASSEMBLY__ */
36 47  
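[Editor's note: how the two per-cpu fields combine is easiest to see in C.
A sketch following the commit message, with get_cpu_timer() as a
hypothetical stand-in for reading the (down-counting) s390 CPU timer; in
the vdso the read and the subtraction happen atomically inside the ectg
instruction itself:

    #include <stdint.h>

    struct vdso_per_cpu_data {
            uint64_t ectg_timer_base;  /* CPU timer at the last kernel exit */
            uint64_t ectg_user_time;   /* cputime accounted up to that exit */
    };

    extern uint64_t get_cpu_timer(void);    /* hypothetical helper */

    static uint64_t thread_cputime(const struct vdso_per_cpu_data *vpcd)
    {
            /* timer counts down: base - now = time run since last exit */
            uint64_t delta = vpcd->ectg_timer_base - get_cpu_timer();
            return vpcd->ectg_user_time + delta;
    }
]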
arch/s390/kernel/asm-offsets.c
... ... @@ -48,6 +48,11 @@
48 48 DEFINE(__VDSO_WTOM_SEC, offsetof(struct vdso_data, wtom_clock_sec));
49 49 DEFINE(__VDSO_WTOM_NSEC, offsetof(struct vdso_data, wtom_clock_nsec));
50 50 DEFINE(__VDSO_TIMEZONE, offsetof(struct vdso_data, tz_minuteswest));
  51 + DEFINE(__VDSO_ECTG_OK, offsetof(struct vdso_data, ectg_available));
  52 + DEFINE(__VDSO_ECTG_BASE,
  53 + offsetof(struct vdso_per_cpu_data, ectg_timer_base));
  54 + DEFINE(__VDSO_ECTG_USER,
  55 + offsetof(struct vdso_per_cpu_data, ectg_user_time));
51 56 /* constants used by the vdso */
52 57 DEFINE(CLOCK_REALTIME, CLOCK_REALTIME);
53 58 DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC);
arch/s390/kernel/entry64.S
... ... @@ -177,8 +177,11 @@
177 177 .if !\sync
178 178 ni \psworg+1,0xfd # clear wait state bit
179 179 .endif
180   - lmg %r0,%r15,SP_R0(%r15) # load gprs 0-15 of user
  180 + lg %r14,__LC_VDSO_PER_CPU
  181 + lmg %r0,%r13,SP_R0(%r15) # load gprs 0-13 of user
181 182 stpt __LC_EXIT_TIMER
  183 + mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER
  184 + lmg %r14,%r15,SP_R14(%r15) # load gprs 14-15 of user
182 185 lpswe \psworg # back to caller
183 186 .endm
184 187  
... ... @@ -980,23 +983,23 @@
980 983  
981 984 cleanup_sysc_leave:
982 985 clc 8(8,%r12),BASED(cleanup_sysc_leave_insn)
983   - je 2f
984   - mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER
  986 + je 3f
985 987 clc 8(8,%r12),BASED(cleanup_sysc_leave_insn+8)
986   - je 2f
987   - mvc __LC_RETURN_PSW(16),SP_PSW(%r15)
  988 + jhe 0f
  989 + mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER
  990 +0: mvc __LC_RETURN_PSW(16),SP_PSW(%r15)
988 991 cghi %r12,__LC_MCK_OLD_PSW
989   - jne 0f
  992 + jne 1f
990 993 mvc __LC_SAVE_AREA+64(32),SP_R12(%r15)
991   - j 1f
992   -0: mvc __LC_SAVE_AREA+32(32),SP_R12(%r15)
993   -1: lmg %r0,%r11,SP_R0(%r15)
  994 + j 2f
  995 +1: mvc __LC_SAVE_AREA+32(32),SP_R12(%r15)
  996 +2: lmg %r0,%r11,SP_R0(%r15)
994 997 lg %r15,SP_R15(%r15)
995   -2: la %r12,__LC_RETURN_PSW
  998 +3: la %r12,__LC_RETURN_PSW
996 999 br %r14
997 1000 cleanup_sysc_leave_insn:
998 1001 .quad sysc_done - 4
999   - .quad sysc_done - 8
  1002 + .quad sysc_done - 16
1000 1003  
1001 1004 cleanup_io_return:
1002 1005 mvc __LC_RETURN_PSW(8),0(%r12)
... ... @@ -1006,23 +1009,23 @@
1006 1009  
1007 1010 cleanup_io_leave:
1008 1011 clc 8(8,%r12),BASED(cleanup_io_leave_insn)
1009   - je 2f
1010   - mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER
  1012 + je 3f
1011 1013 clc 8(8,%r12),BASED(cleanup_io_leave_insn+8)
1012   - je 2f
1013   - mvc __LC_RETURN_PSW(16),SP_PSW(%r15)
  1014 + jhe 0f
  1015 + mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER
  1016 +0: mvc __LC_RETURN_PSW(16),SP_PSW(%r15)
1014 1017 cghi %r12,__LC_MCK_OLD_PSW
1015   - jne 0f
  1018 + jne 1f
1016 1019 mvc __LC_SAVE_AREA+64(32),SP_R12(%r15)
1017   - j 1f
1018   -0: mvc __LC_SAVE_AREA+32(32),SP_R12(%r15)
1019   -1: lmg %r0,%r11,SP_R0(%r15)
  1020 + j 2f
  1021 +1: mvc __LC_SAVE_AREA+32(32),SP_R12(%r15)
  1022 +2: lmg %r0,%r11,SP_R0(%r15)
1020 1023 lg %r15,SP_R15(%r15)
1021   -2: la %r12,__LC_RETURN_PSW
  1024 +3: la %r12,__LC_RETURN_PSW
1022 1025 br %r14
1023 1026 cleanup_io_leave_insn:
1024 1027 .quad io_done - 4
1025   - .quad io_done - 8
  1028 + .quad io_done - 16
1026 1029  
1027 1030 /*
1028 1031 * Integer constants
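[Editor's note: the 16-byte mvc added to RESTORE_ALL works because the
stpt target __LC_EXIT_TIMER and the accumulated-cputime field are adjacent
in the lowcore, so one copy refreshes both vdso base values. In effect it
does the following on every return to user space (a sketch, assuming
user_timer immediately follows exit_timer as in this kernel's lowcore
layout):

    #include <stdint.h>

    struct lc_timers {                  /* slice of the 64-bit lowcore */
            uint64_t exit_timer;        /* stored by stpt just before the mvc */
            uint64_t user_timer;        /* cputime base accumulated so far */
    };

    struct vdso_per_cpu_data {
            uint64_t ectg_timer_base;
            uint64_t ectg_user_time;
    };

    /* What the single 16-byte mvc achieves on every exit to user space. */
    static inline void refresh_ectg_base(struct vdso_per_cpu_data *vpcd,
                                         const struct lc_timers *lc)
    {
            vpcd->ectg_timer_base = lc->exit_timer;
            vpcd->ectg_user_time  = lc->user_timer;
    }
]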
arch/s390/kernel/head64.S
... ... @@ -87,6 +87,8 @@
87 87 lg %r12,.Lparmaddr-.LPG1(%r13) # pointer to parameter area
88 88 # move IPL device to lowcore
89 89 mvc __LC_IPLDEV(4),IPL_DEVICE+4-PARMAREA(%r12)
  90 + lghi %r0,__LC_PASTE
  91 + stg %r0,__LC_VDSO_PER_CPU
90 92 #
91 93 # Setup stack
92 94 #
arch/s390/kernel/setup.c
... ... @@ -427,6 +427,8 @@
427 427 /* enable extended save area */
428 428 __ctl_set_bit(14, 29);
429 429 }
  430 +#else
  431 + lc->vdso_per_cpu_data = (unsigned long) &lc->paste[0];
430 432 #endif
431 433 set_prefix((u32)(unsigned long) lc);
432 434 }
arch/s390/kernel/smp.c
... ... @@ -47,6 +47,7 @@
47 47 #include <asm/lowcore.h>
48 48 #include <asm/sclp.h>
49 49 #include <asm/cpu.h>
  50 +#include <asm/vdso.h>
50 51 #include "entry.h"
51 52  
52 53 /*
... ... @@ -506,6 +507,9 @@
506 507 goto out;
507 508 lowcore->extended_save_area_addr = (u32) save_area;
508 509 }
  510 +#else
  511 + if (vdso_alloc_per_cpu(cpu, lowcore))
  512 + goto out;
509 513 #endif
510 514 lowcore_ptr[cpu] = lowcore;
511 515 return 0;
... ... @@ -528,6 +532,8 @@
528 532 #ifndef CONFIG_64BIT
529 533 if (MACHINE_HAS_IEEE)
530 534 free_page((unsigned long) lowcore->extended_save_area_addr);
  535 +#else
  536 + vdso_free_per_cpu(cpu, lowcore);
531 537 #endif
532 538 free_page(lowcore->panic_stack - PAGE_SIZE);
533 539 free_pages(lowcore->async_stack - ASYNC_SIZE, ASYNC_ORDER);
... ... @@ -670,6 +676,7 @@
670 676 lowcore = (void *) __get_free_pages(GFP_KERNEL | GFP_DMA, lc_order);
671 677 panic_stack = __get_free_page(GFP_KERNEL);
672 678 async_stack = __get_free_pages(GFP_KERNEL, ASYNC_ORDER);
  679 + BUG_ON(!lowcore || !panic_stack || !async_stack);
673 680 #ifndef CONFIG_64BIT
674 681 if (MACHINE_HAS_IEEE)
675 682 save_area = get_zeroed_page(GFP_KERNEL);
... ... @@ -683,6 +690,8 @@
683 690 #ifndef CONFIG_64BIT
684 691 if (MACHINE_HAS_IEEE)
685 692 lowcore->extended_save_area_addr = (u32) save_area;
  693 +#else
  694 + BUG_ON(vdso_alloc_per_cpu(smp_processor_id(), lowcore));
686 695 #endif
687 696 set_prefix((u32)(unsigned long) lowcore);
688 697 local_mcck_enable();
arch/s390/kernel/vdso.c
... ... @@ -31,9 +31,6 @@
31 31 #include <asm/sections.h>
32 32 #include <asm/vdso.h>
33 33  
34   -/* Max supported size for symbol names */
35   -#define MAX_SYMNAME 64
36   -
37 34 #if defined(CONFIG_32BIT) || defined(CONFIG_COMPAT)
38 35 extern char vdso32_start, vdso32_end;
39 36 static void *vdso32_kbase = &vdso32_start;
... ... @@ -71,6 +68,119 @@
71 68 struct vdso_data *vdso_data = &vdso_data_store.data;
72 69  
73 70 /*
  71 + * Setup vdso data page.
  72 + */
  73 +static void vdso_init_data(struct vdso_data *vd)
  74 +{
  75 + unsigned int facility_list;
  76 +
  77 + facility_list = stfl();
  78 + vd->ectg_available = switch_amode && (facility_list & 1);
  79 +}
  80 +
  81 +#ifdef CONFIG_64BIT
  82 +/*
  83 + * Setup per cpu vdso data page.
  84 + */
  85 +static void vdso_init_per_cpu_data(int cpu, struct vdso_per_cpu_data *vpcd)
  86 +{
  87 +}
  88 +
  89 +/*
  90 + * Allocate/free per cpu vdso data.
  91 + */
  92 +#ifdef CONFIG_64BIT
  93 +#define SEGMENT_ORDER 2
  94 +#else
  95 +#define SEGMENT_ORDER 1
  96 +#endif
  97 +
  98 +int vdso_alloc_per_cpu(int cpu, struct _lowcore *lowcore)
  99 +{
  100 + unsigned long segment_table, page_table, page_frame;
  101 + u32 *psal, *aste;
  102 + int i;
  103 +
  104 + lowcore->vdso_per_cpu_data = __LC_PASTE;
  105 +
  106 + if (!switch_amode || !vdso_enabled)
  107 + return 0;
  108 +
  109 + segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER);
  110 + page_table = get_zeroed_page(GFP_KERNEL | GFP_DMA);
  111 + page_frame = get_zeroed_page(GFP_KERNEL);
  112 + if (!segment_table || !page_table || !page_frame)
  113 + goto out;
  114 +
  115 + clear_table((unsigned long *) segment_table, _SEGMENT_ENTRY_EMPTY,
  116 + PAGE_SIZE << SEGMENT_ORDER);
  117 + clear_table((unsigned long *) page_table, _PAGE_TYPE_EMPTY,
  118 + 256*sizeof(unsigned long));
  119 +
  120 + *(unsigned long *) segment_table = _SEGMENT_ENTRY + page_table;
  121 + *(unsigned long *) page_table = _PAGE_RO + page_frame;
  122 +
  123 + psal = (u32 *) (page_table + 256*sizeof(unsigned long));
  124 + aste = psal + 32;
  125 +
  126 + for (i = 4; i < 32; i += 4)
  127 + psal[i] = 0x80000000;
  128 +
  129 + lowcore->paste[4] = (u32)(addr_t) psal;
  130 + psal[0] = 0x20000000;
  131 + psal[2] = (u32)(addr_t) aste;
  132 + *(unsigned long *) (aste + 2) = segment_table +
  133 + _ASCE_TABLE_LENGTH + _ASCE_USER_BITS + _ASCE_TYPE_SEGMENT;
  134 + aste[4] = (u32)(addr_t) psal;
  135 + lowcore->vdso_per_cpu_data = page_frame;
  136 +
  137 + vdso_init_per_cpu_data(cpu, (struct vdso_per_cpu_data *) page_frame);
  138 + return 0;
  139 +
  140 +out:
  141 + free_page(page_frame);
  142 + free_page(page_table);
  143 + free_pages(segment_table, SEGMENT_ORDER);
  144 + return -ENOMEM;
  145 +}
  146 +
  147 +#ifdef CONFIG_HOTPLUG_CPU
  148 +void vdso_free_per_cpu(int cpu, struct _lowcore *lowcore)
  149 +{
  150 + unsigned long segment_table, page_table, page_frame;
  151 + u32 *psal, *aste;
  152 +
  153 + if (!switch_amode || !vdso_enabled)
  154 + return;
  155 +
  156 + psal = (u32 *)(addr_t) lowcore->paste[4];
  157 + aste = (u32 *)(addr_t) psal[2];
  158 + segment_table = *(unsigned long *)(aste + 2) & PAGE_MASK;
  159 + page_table = *(unsigned long *) segment_table;
  160 + page_frame = *(unsigned long *) page_table;
  161 +
  162 + free_page(page_frame);
  163 + free_page(page_table);
  164 + free_pages(segment_table, SEGMENT_ORDER);
  165 +}
  166 +#endif /* CONFIG_HOTPLUG_CPU */
  167 +
  168 +static void __vdso_init_cr5(void *dummy)
  169 +{
  170 + unsigned long cr5;
  171 +
  172 + cr5 = offsetof(struct _lowcore, paste);
  173 + __ctl_load(cr5, 5, 5);
  174 +}
  175 +
  176 +static void vdso_init_cr5(void)
  177 +{
  178 + if (switch_amode && vdso_enabled)
  179 + on_each_cpu(__vdso_init_cr5, NULL, 1);
  180 +}
  181 +#endif /* CONFIG_64BIT */
  182 +
  183 +/*
74 184 * This is called from binfmt_elf, we create the special vma for the
75 185 * vDSO and insert it into the mm struct tree
76 186 */
... ... @@ -172,6 +282,9 @@
172 282 {
173 283 int i;
174 284  
  285 + if (!vdso_enabled)
  286 + return 0;
  287 + vdso_init_data(vdso_data);
175 288 #if defined(CONFIG_32BIT) || defined(CONFIG_COMPAT)
176 289 /* Calculate the size of the 32 bit vDSO */
177 290 vdso32_pages = ((&vdso32_end - &vdso32_start
... ... @@ -208,6 +321,10 @@
208 321 }
209 322 vdso64_pagelist[vdso64_pages - 1] = virt_to_page(vdso_data);
210 323 vdso64_pagelist[vdso64_pages] = NULL;
  324 +#ifndef CONFIG_SMP
  325 + BUG_ON(vdso_alloc_per_cpu(0, &S390_lowcore));
  326 +#endif
  327 + vdso_init_cr5();
211 328 #endif /* CONFIG_64BIT */
212 329  
213 330 get_page(virt_to_page(vdso_data));
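[Editor's note: in vdso_alloc_per_cpu the zeroed page allocated for the
page table is reused for the access-register plumbing: the 256 8-byte page
table entries fill the lower 2 KiB, the 32-word PSAL sits in the upper
half, and the ASTE follows 128 bytes behind it. The address arithmetic can
be checked in isolation (a standalone sketch mirroring the offsets above):

    #include <assert.h>

    int main(void)
    {
            unsigned long page_table = 0;           /* page-aligned base */
            unsigned long psal = page_table + 256 * 8; /* past the ptes */
            unsigned long aste = psal + 32 * 4;     /* PSAL is 32 words */

            assert(psal == page_table + 2048);      /* upper half of page */
            assert(aste == page_table + 2048 + 128);
            assert(aste + 64 <= page_table + 4096); /* 64-byte ASTE fits */
            return 0;
    }
]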
arch/s390/kernel/vdso64/clock_getres.S
... ... @@ -22,7 +22,12 @@
22 22 cghi %r2,CLOCK_REALTIME
23 23 je 0f
24 24 cghi %r2,CLOCK_MONOTONIC
  25 + je 0f
  26 + cghi %r2,-2 /* CLOCK_THREAD_CPUTIME_ID for this thread */
25 27 jne 2f
  28 + larl %r5,_vdso_data
  29 + icm %r0,15,__VDSO_ECTG_OK(%r5)
  30 + jz 2f
26 31 0: ltgr %r3,%r3
27 32 jz 1f /* res == NULL */
28 33 larl %r1,3f
arch/s390/kernel/vdso64/clock_gettime.S
... ... @@ -22,8 +22,10 @@
22 22 larl %r5,_vdso_data
23 23 cghi %r2,CLOCK_REALTIME
24 24 je 4f
  25 + cghi %r2,-2 /* CLOCK_THREAD_CPUTIME_ID for this thread */
  26 + je 9f
25 27 cghi %r2,CLOCK_MONOTONIC
26   - jne 9f
  28 + jne 12f
27 29  
28 30 /* CLOCK_MONOTONIC */
29 31 ltgr %r3,%r3
... ... @@ -42,7 +44,7 @@
42 44 alg %r0,__VDSO_WTOM_SEC(%r5)
43 45 clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
44 46 jne 0b
45   - larl %r5,10f
  47 + larl %r5,13f
46 48 1: clg %r1,0(%r5)
47 49 jl 2f
48 50 slg %r1,0(%r5)
... ... @@ -68,7 +70,7 @@
68 70 lg %r0,__VDSO_XTIME_SEC(%r5)
69 71 clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */
70 72 jne 5b
71   - larl %r5,10f
  73 + larl %r5,13f
72 74 6: clg %r1,0(%r5)
73 75 jl 7f
74 76 slg %r1,0(%r5)
... ... @@ -79,12 +81,39 @@
79 81 8: lghi %r2,0
80 82 br %r14
81 83  
  84 + /* CLOCK_THREAD_CPUTIME_ID for this thread */
  85 +9: icm %r0,15,__VDSO_ECTG_OK(%r5)
  86 + jz 12f
  87 + ear %r2,%a4
  88 + llilh %r4,0x0100
  89 + sar %a4,%r4
  90 + lghi %r4,0
  91 + sacf 512 /* Magic ectg instruction */
  92 + .insn ssf,0xc80100000000,__VDSO_ECTG_BASE(4),__VDSO_ECTG_USER(4),4
  93 + sacf 0
  94 + sar %a4,%r2
  95 + algr %r1,%r0 /* r1 = cputime as TOD value */
  96 + mghi %r1,1000 /* convert to nanoseconds */
  97 + srlg %r1,%r1,12 /* r1 = cputime in nanosec */
  98 + lgr %r4,%r1
  99 + larl %r5,13f
  100 + srlg %r1,%r1,9 /* divide by 1000000000 */
  101 + mlg %r0,8(%r5)
  102 + srlg %r0,%r0,11 /* r0 = tv_sec */
  103 + stg %r0,0(%r3)
  104 + msg %r0,0(%r5) /* calculate tv_nsec */
  105 + slgr %r4,%r0 /* r4 = tv_nsec */
  106 + stg %r4,8(%r3)
  107 + lghi %r2,0
  108 + br %r14
  109 +
82 110 /* Fallback to system call */
83   -9: lghi %r1,__NR_clock_gettime
  111 +12: lghi %r1,__NR_clock_gettime
84 112 svc 0
85 113 br %r14
86 114  
87   -10: .quad 1000000000
  115 +13: .quad 1000000000
  116 +14: .quad 19342813113834067
88 117 .cfi_endproc
89 118 .size __kernel_clock_gettime,.-__kernel_clock_gettime
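[Editor's note: the constant added at label 14: is ceil(2^84 / 10^9) =
19342813113834067; the srlg/mlg/srlg sequence divides by 10^9 with a
multiply, shifting out 9 + 64 + 11 = 84 bits in total. The initial >>9 is
safe because 10^9 is divisible by 2^9, so no second boundary is lost. The
same arithmetic in standalone C, with a 128-bit product in place of mlg:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t ns_to_sec(uint64_t ns)
    {
            /* (ns >> 9) * ceil(2^84 / 1e9); keep the high 64 bits of the
             * 128-bit product (what mlg leaves in r0), then shift by 11. */
            unsigned __int128 p =
                    (unsigned __int128)(ns >> 9) * 19342813113834067ULL;
            return (uint64_t)(p >> 64) >> 11;
    }

    int main(void)
    {
            uint64_t ns = 12345678987654321ULL;     /* arbitrary test value */
            printf("%llu == %llu\n",
                   (unsigned long long) ns_to_sec(ns),
                   (unsigned long long)(ns / 1000000000ULL));
            return 0;
    }
]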