linux/drivers/media/platform/vivi.c

1543 lines
38 KiB
C
Raw Normal View History

/*
* Virtual Video driver - This code emulates a real video device with v4l2 api
*
* Copyright (c) 2006 by:
* Mauro Carvalho Chehab <mchehab--a.t--infradead.org>
* Ted Walther <ted--a.t--enumera.com>
* John Sokol <sokol--a.t--videotechnology.com>
* http://v4l.videotechnology.com/
*
* Conversion to videobuf2 by Pawel Osciak & Marek Szyprowski
* Copyright (c) 2010 Samsung Electronics
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the BSD Licence, GNU General Public License
* as published by the Free Software Foundation; either version 2 of the
* License, or (at your option) any later version
*/
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/font.h>
#include <linux/mutex.h>
#include <linux/videodev2.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <media/videobuf2-vmalloc.h>
#include <media/v4l2-device.h>
#include <media/v4l2-ioctl.h>
#include <media/v4l2-ctrls.h>
#include <media/v4l2-fh.h>
#include <media/v4l2-event.h>
#include <media/v4l2-common.h>
#define VIVI_MODULE_NAME "vivi"
/* Maximum allowed frame rate
*
* Vivi will allow setting timeperframe in [1/FPS_MAX - FPS_MAX/1] range.
*
* Ideally FPS_MAX should be infinity, i.e. practically UINT_MAX, but that
* might hit application errors when they manipulate these values.
*
* Besides, for tpf < 1ms image-generation logic should be changed, to avoid
* producing frames with equal content.
*/
#define FPS_MAX 1000
#define MAX_WIDTH 1920
#define MAX_HEIGHT 1200
#define VIVI_VERSION "0.8.1"
MODULE_DESCRIPTION("Video Technology Magazine Virtual Video Capture Board");
MODULE_AUTHOR("Mauro Carvalho Chehab, Ted Walther and John Sokol");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_VERSION(VIVI_VERSION);
static unsigned video_nr = -1;
module_param(video_nr, uint, 0644);
MODULE_PARM_DESC(video_nr, "videoX start number, -1 is autodetect");
static unsigned n_devs = 1;
module_param(n_devs, uint, 0644);
MODULE_PARM_DESC(n_devs, "number of video devices to create");
static unsigned debug;
module_param(debug, uint, 0644);
MODULE_PARM_DESC(debug, "activates debug info");
/* Global font descriptor */
static const u8 *font8x16;
/* timeperframe: min/max and default */
static const struct v4l2_fract
tpf_min = {.numerator = 1, .denominator = FPS_MAX},
tpf_max = {.numerator = FPS_MAX, .denominator = 1},
tpf_default = {.numerator = 1001, .denominator = 30000}; /* NTSC */
#define dprintk(dev, level, fmt, arg...) \
v4l2_dbg(level, debug, &dev->v4l2_dev, fmt, ## arg)
/* ------------------------------------------------------------------
Basic structures
------------------------------------------------------------------*/
struct vivi_fmt {
const char *name;
u32 fourcc; /* v4l2 format id */
u8 depth;
bool is_yuv;
};
static const struct vivi_fmt formats[] = {
{
.name = "4:2:2, packed, YUYV",
.fourcc = V4L2_PIX_FMT_YUYV,
.depth = 16,
.is_yuv = true,
},
{
.name = "4:2:2, packed, UYVY",
.fourcc = V4L2_PIX_FMT_UYVY,
.depth = 16,
.is_yuv = true,
},
{
.name = "4:2:2, packed, YVYU",
.fourcc = V4L2_PIX_FMT_YVYU,
.depth = 16,
.is_yuv = true,
},
{
.name = "4:2:2, packed, VYUY",
.fourcc = V4L2_PIX_FMT_VYUY,
.depth = 16,
.is_yuv = true,
},
{
.name = "RGB565 (LE)",
.fourcc = V4L2_PIX_FMT_RGB565, /* gggbbbbb rrrrrggg */
.depth = 16,
},
{
.name = "RGB565 (BE)",
.fourcc = V4L2_PIX_FMT_RGB565X, /* rrrrrggg gggbbbbb */
.depth = 16,
},
{
.name = "RGB555 (LE)",
.fourcc = V4L2_PIX_FMT_RGB555, /* gggbbbbb arrrrrgg */
.depth = 16,
},
{
.name = "RGB555 (BE)",
.fourcc = V4L2_PIX_FMT_RGB555X, /* arrrrrgg gggbbbbb */
.depth = 16,
},
{
.name = "RGB24 (LE)",
.fourcc = V4L2_PIX_FMT_RGB24, /* rgb */
.depth = 24,
},
{
.name = "RGB24 (BE)",
.fourcc = V4L2_PIX_FMT_BGR24, /* bgr */
.depth = 24,
},
{
.name = "RGB32 (LE)",
.fourcc = V4L2_PIX_FMT_RGB32, /* argb */
.depth = 32,
},
{
.name = "RGB32 (BE)",
.fourcc = V4L2_PIX_FMT_BGR32, /* bgra */
.depth = 32,
},
};
static const struct vivi_fmt *__get_format(u32 pixelformat)
{
const struct vivi_fmt *fmt;
unsigned int k;
for (k = 0; k < ARRAY_SIZE(formats); k++) {
fmt = &formats[k];
if (fmt->fourcc == pixelformat)
break;
}
if (k == ARRAY_SIZE(formats))
return NULL;
return &formats[k];
}
static const struct vivi_fmt *get_format(struct v4l2_format *f)
{
return __get_format(f->fmt.pix.pixelformat);
}
/* buffer for one video frame */
struct vivi_buffer {
/* common v4l buffer stuff -- must be first */
struct vb2_buffer vb;
struct list_head list;
};
struct vivi_dmaqueue {
struct list_head active;
/* thread for generating video stream*/
struct task_struct *kthread;
wait_queue_head_t wq;
/* Counters to control fps rate */
int frame;
int ini_jiffies;
};
static LIST_HEAD(vivi_devlist);
struct vivi_dev {
struct list_head vivi_devlist;
struct v4l2_device v4l2_dev;
struct v4l2_ctrl_handler ctrl_handler;
struct video_device vdev;
/* controls */
struct v4l2_ctrl *brightness;
struct v4l2_ctrl *contrast;
struct v4l2_ctrl *saturation;
struct v4l2_ctrl *hue;
struct {
/* autogain/gain cluster */
struct v4l2_ctrl *autogain;
struct v4l2_ctrl *gain;
};
struct v4l2_ctrl *volume;
struct v4l2_ctrl *alpha;
struct v4l2_ctrl *button;
struct v4l2_ctrl *boolean;
struct v4l2_ctrl *int32;
struct v4l2_ctrl *int64;
struct v4l2_ctrl *menu;
struct v4l2_ctrl *string;
struct v4l2_ctrl *bitmask;
struct v4l2_ctrl *int_menu;
spinlock_t slock;
struct mutex mutex;
struct vivi_dmaqueue vidq;
/* Several counters */
unsigned ms;
unsigned long jiffies;
unsigned button_pressed;
int mv_count; /* Controls bars movement */
/* Input Number */
int input;
/* video capture */
const struct vivi_fmt *fmt;
struct v4l2_fract timeperframe;
unsigned int width, height;
struct vb2_queue vb_vidq;
unsigned int seq_count;
u8 bars[9][3];
[media] vivi: vivi_dev->line[] was not aligned Though dev->line[] is u8 array we work with it as with u16, u24 or u32 pixels, and also pass it to memcpy() and it's better to align it to at least 4. Before the patch, on x86 offsetof(vivi_dev, line) was 1003 and after patch it is 1004. There is slight performance increase, but I think is is slight, only because we start copying not from line[0]: ---- 8< ---- drivers/media/platform/vivi.c static void vivi_fillbuff(struct vivi_dev *dev, struct vivi_buffer *buf) { ... for (h = 0; h < hmax; h++) memcpy(vbuf + h * wmax * dev->pixelsize, dev->line + (dev->mv_count % wmax) * dev->pixelsize, wmax * dev->pixelsize); before: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # # Samples: 49K of event 'cycles' # Event count (approx.): 16799780016 # # Overhead Command Shared Object # ........ ............... .................... # 27.51% rawv libc-2.13.so [.] __memcpy_ssse3 23.77% vivi-* [kernel.kallsyms] [k] memcpy 9.96% Xorg [unknown] [.] 0xa76f5e12 4.94% vivi-* [vivi] [k] gen_text.constprop.6 4.44% rawv [vivi] [k] gen_twopix 3.17% vivi-* [vivi] [k] vivi_fillbuff 2.45% rawv [vivi] [k] precalculate_line 1.20% swapper [kernel.kallsyms] [k] read_hpet 23.77% vivi-* [kernel.kallsyms] [k] memcpy | --- memcpy | |--99.28%-- vivi_fillbuff | vivi_thread | kthread | ret_from_kernel_thread --0.72%-- [...] after: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # # Samples: 49K of event 'cycles' # Event count (approx.): 16475832370 # # Overhead Command Shared Object # ........ ............... ...................... # 29.07% rawv libc-2.13.so [.] __memcpy_ssse3 20.57% vivi-* [kernel.kallsyms] [k] memcpy 10.20% Xorg [unknown] [.] 0xa7301494 5.16% vivi-* [vivi] [k] gen_text.constprop.6 4.43% rawv [vivi] [k] gen_twopix 4.36% vivi-* [vivi] [k] vivi_fillbuff 2.42% rawv [vivi] [k] precalculate_line 1.33% swapper [kernel.kallsyms] [k] read_hpet Signed-off-by: Kirill Smelkov <kirr@mns.spb.ru> Acked-by: Hans Verkuil <hans.verkuil@cisco.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
2012-11-02 12:10:31 +00:00
u8 line[MAX_WIDTH * 8] __attribute__((__aligned__(4)));
unsigned int pixelsize;
u8 alpha_component;
[media] vivi: Optimize gen_text() I've noticed that vivi takes a lot of CPU to produce its frames. For example for 8 devices and 8 simple programs running, where each captures YUY2 640x480 and displays it to X via SDL, profile timing is as follows: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # Samples: 82K of event 'cycles' # Event count (approx.): 31551930117 # # Overhead Command Shared Object Symbol # ........ ............... .................... # 49.48% vivi-* [vivi] [k] gen_twopix 10.79% vivi-* [kernel.kallsyms] [k] memcpy 10.02% rawv libc-2.13.so [.] __memcpy_ssse3 8.35% vivi-* [vivi] [k] gen_text.constprop.6 5.06% Xorg [unknown] [.] 0xa73015f8 2.32% rawv [vivi] [k] gen_twopix 1.22% rawv [vivi] [k] precalculate_line 1.20% vivi-* [vivi] [k] vivi_fillbuff (rawv is display program, vivi-* is a combination of vivi-000 through vivi-007) so a lot of time is spent in gen_twopix() which as the follwing call-graph profile shows ... 49.48% vivi-* [vivi] [k] gen_twopix | --- gen_twopix | |--96.30%-- gen_text.constprop.6 | vivi_fillbuff | vivi_thread | kthread | ret_from_kernel_thread | --3.70%-- vivi_fillbuff vivi_thread kthread ret_from_kernel_thread ... is called mostly from gen_text(). If we'll look at gen_text(), in the inner loop, we'll see if (chr & (1 << (7 - i))) gen_twopix(dev, pos + j * dev->pixelsize, WHITE, (x+y) & 1); else gen_twopix(dev, pos + j * dev->pixelsize, TEXT_BLACK, (x+y) & 1); which calls gen_twopix() for every character pixel, and that is very expensive, because gen_twopix() branches several times. Now, let's note, that we operate on only two colors - WHITE and TEXT_BLACK, and that pixel for that colors could be precomputed and gen_twopix() moved out of the inner loop. Also note, that for black and white colors even/odd does not make a difference for all supported pixel formats, so we could stop doing that `odd` gen_twopix() parameter game. So the first thing we are doing here is 1) moving gen_twopix() calls out of gen_text() into vivi_fillbuff(), to pregenerate black and white colors, just before printing starts. what we have next is that gen_text's font rendering loop, even with gen_twopix() calls moved out, was inefficient and branchy, so let's 2) rewrite gen_text() loop so it uses less variables + unroll char horizontal-rendering loop + instantiate 3 code paths for pixelsizes 2,3 and 4 so that in all inner loops we don't have to branch or make indirections (*). Done all above reworks, for gen_text() we get nice, non-branchy streamlined code (showing loop for pixelsize=2): ? cmp $0x2,%eax ? ? jne 26 ? mov -0x18(%ebp),%eax ? mov -0x20(%ebp),%edi ? imul -0x20(%ebp),%eax ? movzwl 0x3ffc(%ebx),%esi 0,08 ? movzwl 0x4000(%ebx),%ecx 0,04 ? add %edi,%edi ? mov 0x0,%ebx 0,51 ? mov %edi,-0x1c(%ebp) ? mov %ebx,-0x14(%ebp) ? movl $0x0,-0x10(%ebp) ? lea 0x20(%edx,%eax,2),%eax ? mov %eax,-0x18(%ebp) ? xchg %ax,%ax 0,04 ? a0: mov 0x8(%ebp),%ebx ? mov -0x18(%ebp),%eax 0,04 ? movzbl (%ebx),%edx 0,16 ? test %dl,%dl 0,04 ? ? je 128 0,08 ? lea 0x0(%esi),%esi 1,61 ? b0:???shl $0x4,%edx 1,02 ? ? mov -0x14(%ebp),%edi 2,04 ? ? add -0x10(%ebp),%edx 2,24 ? ? lea 0x1(%ebx),%ebx 0,27 ? ? movzbl (%edi,%edx,1),%edx 9,92 ? ? mov %esi,%edi 0,39 ? ? test %dl,%dl 2,04 ? ? cmovns %ecx,%edi 4,63 ? ? test $0x40,%dl 0,55 ? ? mov %di,(%eax) 3,76 ? ? mov %esi,%edi 0,71 ? ? cmove %ecx,%edi 3,41 ? ? test $0x20,%dl 0,75 ? ? mov %di,0x2(%eax) 2,43 ? ? mov %esi,%edi 0,59 ? ? cmove %ecx,%edi 4,59 ? ? test $0x10,%dl 0,67 ? ? mov %di,0x4(%eax) 2,55 ? ? mov %esi,%edi 0,78 ? ? cmove %ecx,%edi 4,31 ? ? test $0x8,%dl 0,67 ? ? mov %di,0x6(%eax) 5,76 ? ? mov %esi,%edi 1,80 ? ? cmove %ecx,%edi 4,20 ? ? test $0x4,%dl 0,86 ? ? mov %di,0x8(%eax) 2,98 ? ? mov %esi,%edi 1,37 ? ? cmove %ecx,%edi 4,67 ? ? test $0x2,%dl 0,20 ? ? mov %di,0xa(%eax) 2,78 ? ? mov %esi,%edi 0,75 ? ? cmove %ecx,%edi 3,92 ? ? and $0x1,%edx 0,75 ? ? mov %esi,%edx 2,59 ? ? mov %di,0xc(%eax) 0,59 ? ? cmove %ecx,%edx 3,10 ? ? mov %dx,0xe(%eax) 2,39 ? ? add $0x10,%eax 0,51 ? ? movzbl (%ebx),%edx 2,86 ? ? test %dl,%dl 2,31 ? ???jne b0 0,04 ?128: addl $0x1,-0x10(%ebp) 4,00 ? mov -0x1c(%ebp),%eax 0,04 ? add %eax,-0x18(%ebp) 0,08 ? cmpl $0x10,-0x10(%ebp) ? ? jne a0 which almost goes away from the profile: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # Samples: 49K of event 'cycles' # Event count (approx.): 16799780016 # # Overhead Command Shared Object Symbol # ........ ............... .................... # 27.51% rawv libc-2.13.so [.] __memcpy_ssse3 23.77% vivi-* [kernel.kallsyms] [k] memcpy 9.96% Xorg [unknown] [.] 0xa76f5e12 4.94% vivi-* [vivi] [k] gen_text.constprop.6 4.44% rawv [vivi] [k] gen_twopix 3.17% vivi-* [vivi] [k] vivi_fillbuff 2.45% rawv [vivi] [k] precalculate_line 1.20% swapper [kernel.kallsyms] [k] read_hpet i.e. gen_twopix() overhead dropped from 49% to 4% and gen_text() loops from ~8% to ~4%, and overal cycles count dropped from 31551930117 to 16799780016 which is ~1.9x whole workload speedup. (*) for RGB24 rendering I've introduced x24, which could be thought as synthetic u24 for simplifying the code. That's done because for memcpy used for conditional assignment, gcc generates suboptimal code with more indirections. Fortunately, in C struct assignment is builtin and that's all we need from pixeltype for font rendering. Signed-off-by: Kirill Smelkov <kirr@mns.spb.ru> Acked-by: Hans Verkuil <hans.verkuil@cisco.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
2012-11-02 12:10:30 +00:00
u32 textfg, textbg;
};
/* ------------------------------------------------------------------
DMA and thread functions
------------------------------------------------------------------*/
/* Bars and Colors should match positions */
enum colors {
WHITE,
AMBER,
CYAN,
GREEN,
MAGENTA,
RED,
BLUE,
BLACK,
TEXT_BLACK,
};
/* R G B */
#define COLOR_WHITE {204, 204, 204}
#define COLOR_AMBER {208, 208, 0}
#define COLOR_CYAN { 0, 206, 206}
#define COLOR_GREEN { 0, 239, 0}
#define COLOR_MAGENTA {239, 0, 239}
#define COLOR_RED {205, 0, 0}
#define COLOR_BLUE { 0, 0, 255}
#define COLOR_BLACK { 0, 0, 0}
struct bar_std {
u8 bar[9][3];
};
/* Maximum number of bars are 10 - otherwise, the input print code
should be modified */
static const struct bar_std bars[] = {
{ /* Standard ITU-R color bar sequence */
{ COLOR_WHITE, COLOR_AMBER, COLOR_CYAN, COLOR_GREEN,
COLOR_MAGENTA, COLOR_RED, COLOR_BLUE, COLOR_BLACK, COLOR_BLACK }
}, {
{ COLOR_WHITE, COLOR_AMBER, COLOR_BLACK, COLOR_WHITE,
COLOR_AMBER, COLOR_BLACK, COLOR_WHITE, COLOR_AMBER, COLOR_BLACK }
}, {
{ COLOR_WHITE, COLOR_CYAN, COLOR_BLACK, COLOR_WHITE,
COLOR_CYAN, COLOR_BLACK, COLOR_WHITE, COLOR_CYAN, COLOR_BLACK }
}, {
{ COLOR_WHITE, COLOR_GREEN, COLOR_BLACK, COLOR_WHITE,
COLOR_GREEN, COLOR_BLACK, COLOR_WHITE, COLOR_GREEN, COLOR_BLACK }
},
};
#define NUM_INPUTS ARRAY_SIZE(bars)
#define TO_Y(r, g, b) \
(((16829 * r + 33039 * g + 6416 * b + 32768) >> 16) + 16)
/* RGB to V(Cr) Color transform */
#define TO_V(r, g, b) \
(((28784 * r - 24103 * g - 4681 * b + 32768) >> 16) + 128)
/* RGB to U(Cb) Color transform */
#define TO_U(r, g, b) \
(((-9714 * r - 19070 * g + 28784 * b + 32768) >> 16) + 128)
/* precalculate color bar values to speed up rendering */
static void precalculate_bars(struct vivi_dev *dev)
{
u8 r, g, b;
int k, is_yuv;
for (k = 0; k < 9; k++) {
r = bars[dev->input].bar[k][0];
g = bars[dev->input].bar[k][1];
b = bars[dev->input].bar[k][2];
is_yuv = dev->fmt->is_yuv;
switch (dev->fmt->fourcc) {
case V4L2_PIX_FMT_RGB565:
case V4L2_PIX_FMT_RGB565X:
r >>= 3;
g >>= 2;
b >>= 3;
break;
case V4L2_PIX_FMT_RGB555:
case V4L2_PIX_FMT_RGB555X:
r >>= 3;
g >>= 3;
b >>= 3;
break;
case V4L2_PIX_FMT_YUYV:
case V4L2_PIX_FMT_UYVY:
case V4L2_PIX_FMT_YVYU:
case V4L2_PIX_FMT_VYUY:
case V4L2_PIX_FMT_RGB24:
case V4L2_PIX_FMT_BGR24:
case V4L2_PIX_FMT_RGB32:
case V4L2_PIX_FMT_BGR32:
break;
}
if (is_yuv) {
dev->bars[k][0] = TO_Y(r, g, b); /* Luma */
dev->bars[k][1] = TO_U(r, g, b); /* Cb */
dev->bars[k][2] = TO_V(r, g, b); /* Cr */
} else {
dev->bars[k][0] = r;
dev->bars[k][1] = g;
dev->bars[k][2] = b;
}
}
}
/* 'odd' is true for pixels 1, 3, 5, etc. and false for pixels 0, 2, 4, etc. */
static void gen_twopix(struct vivi_dev *dev, u8 *buf, int colorpos, bool odd)
{
u8 r_y, g_u, b_v;
u8 alpha = dev->alpha_component;
int color;
u8 *p;
r_y = dev->bars[colorpos][0]; /* R or precalculated Y */
g_u = dev->bars[colorpos][1]; /* G or precalculated U */
b_v = dev->bars[colorpos][2]; /* B or precalculated V */
for (color = 0; color < dev->pixelsize; color++) {
p = buf + color;
switch (dev->fmt->fourcc) {
case V4L2_PIX_FMT_YUYV:
switch (color) {
case 0:
*p = r_y;
break;
case 1:
*p = odd ? b_v : g_u;
break;
}
break;
case V4L2_PIX_FMT_UYVY:
switch (color) {
case 0:
*p = odd ? b_v : g_u;
break;
case 1:
*p = r_y;
break;
}
break;
case V4L2_PIX_FMT_YVYU:
switch (color) {
case 0:
*p = r_y;
break;
case 1:
*p = odd ? g_u : b_v;
break;
}
break;
case V4L2_PIX_FMT_VYUY:
switch (color) {
case 0:
*p = odd ? g_u : b_v;
break;
case 1:
*p = r_y;
break;
}
break;
case V4L2_PIX_FMT_RGB565:
switch (color) {
case 0:
*p = (g_u << 5) | b_v;
break;
case 1:
*p = (r_y << 3) | (g_u >> 3);
break;
}
break;
case V4L2_PIX_FMT_RGB565X:
switch (color) {
case 0:
*p = (r_y << 3) | (g_u >> 3);
break;
case 1:
*p = (g_u << 5) | b_v;
break;
}
break;
case V4L2_PIX_FMT_RGB555:
switch (color) {
case 0:
*p = (g_u << 5) | b_v;
break;
case 1:
*p = (alpha & 0x80) | (r_y << 2) | (g_u >> 3);
break;
}
break;
case V4L2_PIX_FMT_RGB555X:
switch (color) {
case 0:
*p = (alpha & 0x80) | (r_y << 2) | (g_u >> 3);
break;
case 1:
*p = (g_u << 5) | b_v;
break;
}
break;
case V4L2_PIX_FMT_RGB24:
switch (color) {
case 0:
*p = r_y;
break;
case 1:
*p = g_u;
break;
case 2:
*p = b_v;
break;
}
break;
case V4L2_PIX_FMT_BGR24:
switch (color) {
case 0:
*p = b_v;
break;
case 1:
*p = g_u;
break;
case 2:
*p = r_y;
break;
}
break;
case V4L2_PIX_FMT_RGB32:
switch (color) {
case 0:
*p = alpha;
break;
case 1:
*p = r_y;
break;
case 2:
*p = g_u;
break;
case 3:
*p = b_v;
break;
}
break;
case V4L2_PIX_FMT_BGR32:
switch (color) {
case 0:
*p = b_v;
break;
case 1:
*p = g_u;
break;
case 2:
*p = r_y;
break;
case 3:
*p = alpha;
break;
}
break;
}
}
}
static void precalculate_line(struct vivi_dev *dev)
{
[media] vivi: Optimize precalculate_line() precalculate_line() is not very high on profile, but it calls expensive gen_twopix(), so let's polish it too: call gen_twopix() only once for every color bar and then distribute the result. before: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # # Samples: 46K of event 'cycles' # Event count (approx.): 15574200568 # # Overhead Command Shared Object # ........ ............... .................... # 27.99% rawv libc-2.13.so [.] __memcpy_ssse3 23.29% vivi-* [kernel.kallsyms] [k] memcpy 10.30% Xorg [unknown] [.] 0xa75c98f8 5.34% vivi-* [vivi] [k] gen_text.constprop.6 4.61% rawv [vivi] [k] gen_twopix 2.64% rawv [vivi] [k] precalculate_line 1.37% swapper [kernel.kallsyms] [k] read_hpet after: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # # Samples: 45K of event 'cycles' # Event count (approx.): 15561769214 # # Overhead Command Shared Object # ........ ............... .................... # 30.73% rawv libc-2.13.so [.] __memcpy_ssse3 26.78% vivi-* [kernel.kallsyms] [k] memcpy 10.68% Xorg [unknown] [.] 0xa73015e9 5.55% vivi-* [vivi] [k] gen_text.constprop.6 1.36% swapper [kernel.kallsyms] [k] read_hpet 0.96% Xorg [kernel.kallsyms] [k] read_hpet ... 0.16% rawv [vivi] [k] precalculate_line ... 0.14% rawv [vivi] [k] gen_twopix (i.e. gen_twopix and precalculate_line overheads are almost gone) Signed-off-by: Kirill Smelkov <kirr@mns.spb.ru> Acked-by: Hans Verkuil <hans.verkuil@cisco.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
2012-11-02 12:10:33 +00:00
unsigned pixsize = dev->pixelsize;
unsigned pixsize2 = 2*pixsize;
int colorpos;
u8 *pos;
for (colorpos = 0; colorpos < 16; ++colorpos) {
u8 pix[8];
int wstart = colorpos * dev->width / 8;
int wend = (colorpos+1) * dev->width / 8;
int w;
gen_twopix(dev, &pix[0], colorpos % 8, 0);
gen_twopix(dev, &pix[pixsize], colorpos % 8, 1);
for (w = wstart/2*2, pos = dev->line + w*pixsize; w < wend; w += 2, pos += pixsize2)
memcpy(pos, pix, pixsize2);
}
}
[media] vivi: Optimize gen_text() I've noticed that vivi takes a lot of CPU to produce its frames. For example for 8 devices and 8 simple programs running, where each captures YUY2 640x480 and displays it to X via SDL, profile timing is as follows: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # Samples: 82K of event 'cycles' # Event count (approx.): 31551930117 # # Overhead Command Shared Object Symbol # ........ ............... .................... # 49.48% vivi-* [vivi] [k] gen_twopix 10.79% vivi-* [kernel.kallsyms] [k] memcpy 10.02% rawv libc-2.13.so [.] __memcpy_ssse3 8.35% vivi-* [vivi] [k] gen_text.constprop.6 5.06% Xorg [unknown] [.] 0xa73015f8 2.32% rawv [vivi] [k] gen_twopix 1.22% rawv [vivi] [k] precalculate_line 1.20% vivi-* [vivi] [k] vivi_fillbuff (rawv is display program, vivi-* is a combination of vivi-000 through vivi-007) so a lot of time is spent in gen_twopix() which as the follwing call-graph profile shows ... 49.48% vivi-* [vivi] [k] gen_twopix | --- gen_twopix | |--96.30%-- gen_text.constprop.6 | vivi_fillbuff | vivi_thread | kthread | ret_from_kernel_thread | --3.70%-- vivi_fillbuff vivi_thread kthread ret_from_kernel_thread ... is called mostly from gen_text(). If we'll look at gen_text(), in the inner loop, we'll see if (chr & (1 << (7 - i))) gen_twopix(dev, pos + j * dev->pixelsize, WHITE, (x+y) & 1); else gen_twopix(dev, pos + j * dev->pixelsize, TEXT_BLACK, (x+y) & 1); which calls gen_twopix() for every character pixel, and that is very expensive, because gen_twopix() branches several times. Now, let's note, that we operate on only two colors - WHITE and TEXT_BLACK, and that pixel for that colors could be precomputed and gen_twopix() moved out of the inner loop. Also note, that for black and white colors even/odd does not make a difference for all supported pixel formats, so we could stop doing that `odd` gen_twopix() parameter game. So the first thing we are doing here is 1) moving gen_twopix() calls out of gen_text() into vivi_fillbuff(), to pregenerate black and white colors, just before printing starts. what we have next is that gen_text's font rendering loop, even with gen_twopix() calls moved out, was inefficient and branchy, so let's 2) rewrite gen_text() loop so it uses less variables + unroll char horizontal-rendering loop + instantiate 3 code paths for pixelsizes 2,3 and 4 so that in all inner loops we don't have to branch or make indirections (*). Done all above reworks, for gen_text() we get nice, non-branchy streamlined code (showing loop for pixelsize=2): ? cmp $0x2,%eax ? ? jne 26 ? mov -0x18(%ebp),%eax ? mov -0x20(%ebp),%edi ? imul -0x20(%ebp),%eax ? movzwl 0x3ffc(%ebx),%esi 0,08 ? movzwl 0x4000(%ebx),%ecx 0,04 ? add %edi,%edi ? mov 0x0,%ebx 0,51 ? mov %edi,-0x1c(%ebp) ? mov %ebx,-0x14(%ebp) ? movl $0x0,-0x10(%ebp) ? lea 0x20(%edx,%eax,2),%eax ? mov %eax,-0x18(%ebp) ? xchg %ax,%ax 0,04 ? a0: mov 0x8(%ebp),%ebx ? mov -0x18(%ebp),%eax 0,04 ? movzbl (%ebx),%edx 0,16 ? test %dl,%dl 0,04 ? ? je 128 0,08 ? lea 0x0(%esi),%esi 1,61 ? b0:???shl $0x4,%edx 1,02 ? ? mov -0x14(%ebp),%edi 2,04 ? ? add -0x10(%ebp),%edx 2,24 ? ? lea 0x1(%ebx),%ebx 0,27 ? ? movzbl (%edi,%edx,1),%edx 9,92 ? ? mov %esi,%edi 0,39 ? ? test %dl,%dl 2,04 ? ? cmovns %ecx,%edi 4,63 ? ? test $0x40,%dl 0,55 ? ? mov %di,(%eax) 3,76 ? ? mov %esi,%edi 0,71 ? ? cmove %ecx,%edi 3,41 ? ? test $0x20,%dl 0,75 ? ? mov %di,0x2(%eax) 2,43 ? ? mov %esi,%edi 0,59 ? ? cmove %ecx,%edi 4,59 ? ? test $0x10,%dl 0,67 ? ? mov %di,0x4(%eax) 2,55 ? ? mov %esi,%edi 0,78 ? ? cmove %ecx,%edi 4,31 ? ? test $0x8,%dl 0,67 ? ? mov %di,0x6(%eax) 5,76 ? ? mov %esi,%edi 1,80 ? ? cmove %ecx,%edi 4,20 ? ? test $0x4,%dl 0,86 ? ? mov %di,0x8(%eax) 2,98 ? ? mov %esi,%edi 1,37 ? ? cmove %ecx,%edi 4,67 ? ? test $0x2,%dl 0,20 ? ? mov %di,0xa(%eax) 2,78 ? ? mov %esi,%edi 0,75 ? ? cmove %ecx,%edi 3,92 ? ? and $0x1,%edx 0,75 ? ? mov %esi,%edx 2,59 ? ? mov %di,0xc(%eax) 0,59 ? ? cmove %ecx,%edx 3,10 ? ? mov %dx,0xe(%eax) 2,39 ? ? add $0x10,%eax 0,51 ? ? movzbl (%ebx),%edx 2,86 ? ? test %dl,%dl 2,31 ? ???jne b0 0,04 ?128: addl $0x1,-0x10(%ebp) 4,00 ? mov -0x1c(%ebp),%eax 0,04 ? add %eax,-0x18(%ebp) 0,08 ? cmpl $0x10,-0x10(%ebp) ? ? jne a0 which almost goes away from the profile: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # Samples: 49K of event 'cycles' # Event count (approx.): 16799780016 # # Overhead Command Shared Object Symbol # ........ ............... .................... # 27.51% rawv libc-2.13.so [.] __memcpy_ssse3 23.77% vivi-* [kernel.kallsyms] [k] memcpy 9.96% Xorg [unknown] [.] 0xa76f5e12 4.94% vivi-* [vivi] [k] gen_text.constprop.6 4.44% rawv [vivi] [k] gen_twopix 3.17% vivi-* [vivi] [k] vivi_fillbuff 2.45% rawv [vivi] [k] precalculate_line 1.20% swapper [kernel.kallsyms] [k] read_hpet i.e. gen_twopix() overhead dropped from 49% to 4% and gen_text() loops from ~8% to ~4%, and overal cycles count dropped from 31551930117 to 16799780016 which is ~1.9x whole workload speedup. (*) for RGB24 rendering I've introduced x24, which could be thought as synthetic u24 for simplifying the code. That's done because for memcpy used for conditional assignment, gcc generates suboptimal code with more indirections. Fortunately, in C struct assignment is builtin and that's all we need from pixeltype for font rendering. Signed-off-by: Kirill Smelkov <kirr@mns.spb.ru> Acked-by: Hans Verkuil <hans.verkuil@cisco.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
2012-11-02 12:10:30 +00:00
/* need this to do rgb24 rendering */
typedef struct { u16 __; u8 _; } __attribute__((packed)) x24;
static void gen_text(struct vivi_dev *dev, char *basep,
int y, int x, char *text)
{
int line;
[media] vivi: Optimize gen_text() I've noticed that vivi takes a lot of CPU to produce its frames. For example for 8 devices and 8 simple programs running, where each captures YUY2 640x480 and displays it to X via SDL, profile timing is as follows: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # Samples: 82K of event 'cycles' # Event count (approx.): 31551930117 # # Overhead Command Shared Object Symbol # ........ ............... .................... # 49.48% vivi-* [vivi] [k] gen_twopix 10.79% vivi-* [kernel.kallsyms] [k] memcpy 10.02% rawv libc-2.13.so [.] __memcpy_ssse3 8.35% vivi-* [vivi] [k] gen_text.constprop.6 5.06% Xorg [unknown] [.] 0xa73015f8 2.32% rawv [vivi] [k] gen_twopix 1.22% rawv [vivi] [k] precalculate_line 1.20% vivi-* [vivi] [k] vivi_fillbuff (rawv is display program, vivi-* is a combination of vivi-000 through vivi-007) so a lot of time is spent in gen_twopix() which as the follwing call-graph profile shows ... 49.48% vivi-* [vivi] [k] gen_twopix | --- gen_twopix | |--96.30%-- gen_text.constprop.6 | vivi_fillbuff | vivi_thread | kthread | ret_from_kernel_thread | --3.70%-- vivi_fillbuff vivi_thread kthread ret_from_kernel_thread ... is called mostly from gen_text(). If we'll look at gen_text(), in the inner loop, we'll see if (chr & (1 << (7 - i))) gen_twopix(dev, pos + j * dev->pixelsize, WHITE, (x+y) & 1); else gen_twopix(dev, pos + j * dev->pixelsize, TEXT_BLACK, (x+y) & 1); which calls gen_twopix() for every character pixel, and that is very expensive, because gen_twopix() branches several times. Now, let's note, that we operate on only two colors - WHITE and TEXT_BLACK, and that pixel for that colors could be precomputed and gen_twopix() moved out of the inner loop. Also note, that for black and white colors even/odd does not make a difference for all supported pixel formats, so we could stop doing that `odd` gen_twopix() parameter game. So the first thing we are doing here is 1) moving gen_twopix() calls out of gen_text() into vivi_fillbuff(), to pregenerate black and white colors, just before printing starts. what we have next is that gen_text's font rendering loop, even with gen_twopix() calls moved out, was inefficient and branchy, so let's 2) rewrite gen_text() loop so it uses less variables + unroll char horizontal-rendering loop + instantiate 3 code paths for pixelsizes 2,3 and 4 so that in all inner loops we don't have to branch or make indirections (*). Done all above reworks, for gen_text() we get nice, non-branchy streamlined code (showing loop for pixelsize=2): ? cmp $0x2,%eax ? ? jne 26 ? mov -0x18(%ebp),%eax ? mov -0x20(%ebp),%edi ? imul -0x20(%ebp),%eax ? movzwl 0x3ffc(%ebx),%esi 0,08 ? movzwl 0x4000(%ebx),%ecx 0,04 ? add %edi,%edi ? mov 0x0,%ebx 0,51 ? mov %edi,-0x1c(%ebp) ? mov %ebx,-0x14(%ebp) ? movl $0x0,-0x10(%ebp) ? lea 0x20(%edx,%eax,2),%eax ? mov %eax,-0x18(%ebp) ? xchg %ax,%ax 0,04 ? a0: mov 0x8(%ebp),%ebx ? mov -0x18(%ebp),%eax 0,04 ? movzbl (%ebx),%edx 0,16 ? test %dl,%dl 0,04 ? ? je 128 0,08 ? lea 0x0(%esi),%esi 1,61 ? b0:???shl $0x4,%edx 1,02 ? ? mov -0x14(%ebp),%edi 2,04 ? ? add -0x10(%ebp),%edx 2,24 ? ? lea 0x1(%ebx),%ebx 0,27 ? ? movzbl (%edi,%edx,1),%edx 9,92 ? ? mov %esi,%edi 0,39 ? ? test %dl,%dl 2,04 ? ? cmovns %ecx,%edi 4,63 ? ? test $0x40,%dl 0,55 ? ? mov %di,(%eax) 3,76 ? ? mov %esi,%edi 0,71 ? ? cmove %ecx,%edi 3,41 ? ? test $0x20,%dl 0,75 ? ? mov %di,0x2(%eax) 2,43 ? ? mov %esi,%edi 0,59 ? ? cmove %ecx,%edi 4,59 ? ? test $0x10,%dl 0,67 ? ? mov %di,0x4(%eax) 2,55 ? ? mov %esi,%edi 0,78 ? ? cmove %ecx,%edi 4,31 ? ? test $0x8,%dl 0,67 ? ? mov %di,0x6(%eax) 5,76 ? ? mov %esi,%edi 1,80 ? ? cmove %ecx,%edi 4,20 ? ? test $0x4,%dl 0,86 ? ? mov %di,0x8(%eax) 2,98 ? ? mov %esi,%edi 1,37 ? ? cmove %ecx,%edi 4,67 ? ? test $0x2,%dl 0,20 ? ? mov %di,0xa(%eax) 2,78 ? ? mov %esi,%edi 0,75 ? ? cmove %ecx,%edi 3,92 ? ? and $0x1,%edx 0,75 ? ? mov %esi,%edx 2,59 ? ? mov %di,0xc(%eax) 0,59 ? ? cmove %ecx,%edx 3,10 ? ? mov %dx,0xe(%eax) 2,39 ? ? add $0x10,%eax 0,51 ? ? movzbl (%ebx),%edx 2,86 ? ? test %dl,%dl 2,31 ? ???jne b0 0,04 ?128: addl $0x1,-0x10(%ebp) 4,00 ? mov -0x1c(%ebp),%eax 0,04 ? add %eax,-0x18(%ebp) 0,08 ? cmpl $0x10,-0x10(%ebp) ? ? jne a0 which almost goes away from the profile: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # Samples: 49K of event 'cycles' # Event count (approx.): 16799780016 # # Overhead Command Shared Object Symbol # ........ ............... .................... # 27.51% rawv libc-2.13.so [.] __memcpy_ssse3 23.77% vivi-* [kernel.kallsyms] [k] memcpy 9.96% Xorg [unknown] [.] 0xa76f5e12 4.94% vivi-* [vivi] [k] gen_text.constprop.6 4.44% rawv [vivi] [k] gen_twopix 3.17% vivi-* [vivi] [k] vivi_fillbuff 2.45% rawv [vivi] [k] precalculate_line 1.20% swapper [kernel.kallsyms] [k] read_hpet i.e. gen_twopix() overhead dropped from 49% to 4% and gen_text() loops from ~8% to ~4%, and overal cycles count dropped from 31551930117 to 16799780016 which is ~1.9x whole workload speedup. (*) for RGB24 rendering I've introduced x24, which could be thought as synthetic u24 for simplifying the code. That's done because for memcpy used for conditional assignment, gcc generates suboptimal code with more indirections. Fortunately, in C struct assignment is builtin and that's all we need from pixeltype for font rendering. Signed-off-by: Kirill Smelkov <kirr@mns.spb.ru> Acked-by: Hans Verkuil <hans.verkuil@cisco.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
2012-11-02 12:10:30 +00:00
unsigned int width = dev->width;
/* Checks if it is possible to show string */
[media] vivi: Optimize gen_text() I've noticed that vivi takes a lot of CPU to produce its frames. For example for 8 devices and 8 simple programs running, where each captures YUY2 640x480 and displays it to X via SDL, profile timing is as follows: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # Samples: 82K of event 'cycles' # Event count (approx.): 31551930117 # # Overhead Command Shared Object Symbol # ........ ............... .................... # 49.48% vivi-* [vivi] [k] gen_twopix 10.79% vivi-* [kernel.kallsyms] [k] memcpy 10.02% rawv libc-2.13.so [.] __memcpy_ssse3 8.35% vivi-* [vivi] [k] gen_text.constprop.6 5.06% Xorg [unknown] [.] 0xa73015f8 2.32% rawv [vivi] [k] gen_twopix 1.22% rawv [vivi] [k] precalculate_line 1.20% vivi-* [vivi] [k] vivi_fillbuff (rawv is display program, vivi-* is a combination of vivi-000 through vivi-007) so a lot of time is spent in gen_twopix() which as the follwing call-graph profile shows ... 49.48% vivi-* [vivi] [k] gen_twopix | --- gen_twopix | |--96.30%-- gen_text.constprop.6 | vivi_fillbuff | vivi_thread | kthread | ret_from_kernel_thread | --3.70%-- vivi_fillbuff vivi_thread kthread ret_from_kernel_thread ... is called mostly from gen_text(). If we'll look at gen_text(), in the inner loop, we'll see if (chr & (1 << (7 - i))) gen_twopix(dev, pos + j * dev->pixelsize, WHITE, (x+y) & 1); else gen_twopix(dev, pos + j * dev->pixelsize, TEXT_BLACK, (x+y) & 1); which calls gen_twopix() for every character pixel, and that is very expensive, because gen_twopix() branches several times. Now, let's note, that we operate on only two colors - WHITE and TEXT_BLACK, and that pixel for that colors could be precomputed and gen_twopix() moved out of the inner loop. Also note, that for black and white colors even/odd does not make a difference for all supported pixel formats, so we could stop doing that `odd` gen_twopix() parameter game. So the first thing we are doing here is 1) moving gen_twopix() calls out of gen_text() into vivi_fillbuff(), to pregenerate black and white colors, just before printing starts. what we have next is that gen_text's font rendering loop, even with gen_twopix() calls moved out, was inefficient and branchy, so let's 2) rewrite gen_text() loop so it uses less variables + unroll char horizontal-rendering loop + instantiate 3 code paths for pixelsizes 2,3 and 4 so that in all inner loops we don't have to branch or make indirections (*). Done all above reworks, for gen_text() we get nice, non-branchy streamlined code (showing loop for pixelsize=2): ? cmp $0x2,%eax ? ? jne 26 ? mov -0x18(%ebp),%eax ? mov -0x20(%ebp),%edi ? imul -0x20(%ebp),%eax ? movzwl 0x3ffc(%ebx),%esi 0,08 ? movzwl 0x4000(%ebx),%ecx 0,04 ? add %edi,%edi ? mov 0x0,%ebx 0,51 ? mov %edi,-0x1c(%ebp) ? mov %ebx,-0x14(%ebp) ? movl $0x0,-0x10(%ebp) ? lea 0x20(%edx,%eax,2),%eax ? mov %eax,-0x18(%ebp) ? xchg %ax,%ax 0,04 ? a0: mov 0x8(%ebp),%ebx ? mov -0x18(%ebp),%eax 0,04 ? movzbl (%ebx),%edx 0,16 ? test %dl,%dl 0,04 ? ? je 128 0,08 ? lea 0x0(%esi),%esi 1,61 ? b0:???shl $0x4,%edx 1,02 ? ? mov -0x14(%ebp),%edi 2,04 ? ? add -0x10(%ebp),%edx 2,24 ? ? lea 0x1(%ebx),%ebx 0,27 ? ? movzbl (%edi,%edx,1),%edx 9,92 ? ? mov %esi,%edi 0,39 ? ? test %dl,%dl 2,04 ? ? cmovns %ecx,%edi 4,63 ? ? test $0x40,%dl 0,55 ? ? mov %di,(%eax) 3,76 ? ? mov %esi,%edi 0,71 ? ? cmove %ecx,%edi 3,41 ? ? test $0x20,%dl 0,75 ? ? mov %di,0x2(%eax) 2,43 ? ? mov %esi,%edi 0,59 ? ? cmove %ecx,%edi 4,59 ? ? test $0x10,%dl 0,67 ? ? mov %di,0x4(%eax) 2,55 ? ? mov %esi,%edi 0,78 ? ? cmove %ecx,%edi 4,31 ? ? test $0x8,%dl 0,67 ? ? mov %di,0x6(%eax) 5,76 ? ? mov %esi,%edi 1,80 ? ? cmove %ecx,%edi 4,20 ? ? test $0x4,%dl 0,86 ? ? mov %di,0x8(%eax) 2,98 ? ? mov %esi,%edi 1,37 ? ? cmove %ecx,%edi 4,67 ? ? test $0x2,%dl 0,20 ? ? mov %di,0xa(%eax) 2,78 ? ? mov %esi,%edi 0,75 ? ? cmove %ecx,%edi 3,92 ? ? and $0x1,%edx 0,75 ? ? mov %esi,%edx 2,59 ? ? mov %di,0xc(%eax) 0,59 ? ? cmove %ecx,%edx 3,10 ? ? mov %dx,0xe(%eax) 2,39 ? ? add $0x10,%eax 0,51 ? ? movzbl (%ebx),%edx 2,86 ? ? test %dl,%dl 2,31 ? ???jne b0 0,04 ?128: addl $0x1,-0x10(%ebp) 4,00 ? mov -0x1c(%ebp),%eax 0,04 ? add %eax,-0x18(%ebp) 0,08 ? cmpl $0x10,-0x10(%ebp) ? ? jne a0 which almost goes away from the profile: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # Samples: 49K of event 'cycles' # Event count (approx.): 16799780016 # # Overhead Command Shared Object Symbol # ........ ............... .................... # 27.51% rawv libc-2.13.so [.] __memcpy_ssse3 23.77% vivi-* [kernel.kallsyms] [k] memcpy 9.96% Xorg [unknown] [.] 0xa76f5e12 4.94% vivi-* [vivi] [k] gen_text.constprop.6 4.44% rawv [vivi] [k] gen_twopix 3.17% vivi-* [vivi] [k] vivi_fillbuff 2.45% rawv [vivi] [k] precalculate_line 1.20% swapper [kernel.kallsyms] [k] read_hpet i.e. gen_twopix() overhead dropped from 49% to 4% and gen_text() loops from ~8% to ~4%, and overal cycles count dropped from 31551930117 to 16799780016 which is ~1.9x whole workload speedup. (*) for RGB24 rendering I've introduced x24, which could be thought as synthetic u24 for simplifying the code. That's done because for memcpy used for conditional assignment, gcc generates suboptimal code with more indirections. Fortunately, in C struct assignment is builtin and that's all we need from pixeltype for font rendering. Signed-off-by: Kirill Smelkov <kirr@mns.spb.ru> Acked-by: Hans Verkuil <hans.verkuil@cisco.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
2012-11-02 12:10:30 +00:00
if (y + 16 >= dev->height || x + strlen(text) * 8 >= width)
return;
/* Print stream time */
[media] vivi: Optimize gen_text() I've noticed that vivi takes a lot of CPU to produce its frames. For example for 8 devices and 8 simple programs running, where each captures YUY2 640x480 and displays it to X via SDL, profile timing is as follows: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # Samples: 82K of event 'cycles' # Event count (approx.): 31551930117 # # Overhead Command Shared Object Symbol # ........ ............... .................... # 49.48% vivi-* [vivi] [k] gen_twopix 10.79% vivi-* [kernel.kallsyms] [k] memcpy 10.02% rawv libc-2.13.so [.] __memcpy_ssse3 8.35% vivi-* [vivi] [k] gen_text.constprop.6 5.06% Xorg [unknown] [.] 0xa73015f8 2.32% rawv [vivi] [k] gen_twopix 1.22% rawv [vivi] [k] precalculate_line 1.20% vivi-* [vivi] [k] vivi_fillbuff (rawv is display program, vivi-* is a combination of vivi-000 through vivi-007) so a lot of time is spent in gen_twopix() which as the follwing call-graph profile shows ... 49.48% vivi-* [vivi] [k] gen_twopix | --- gen_twopix | |--96.30%-- gen_text.constprop.6 | vivi_fillbuff | vivi_thread | kthread | ret_from_kernel_thread | --3.70%-- vivi_fillbuff vivi_thread kthread ret_from_kernel_thread ... is called mostly from gen_text(). If we'll look at gen_text(), in the inner loop, we'll see if (chr & (1 << (7 - i))) gen_twopix(dev, pos + j * dev->pixelsize, WHITE, (x+y) & 1); else gen_twopix(dev, pos + j * dev->pixelsize, TEXT_BLACK, (x+y) & 1); which calls gen_twopix() for every character pixel, and that is very expensive, because gen_twopix() branches several times. Now, let's note, that we operate on only two colors - WHITE and TEXT_BLACK, and that pixel for that colors could be precomputed and gen_twopix() moved out of the inner loop. Also note, that for black and white colors even/odd does not make a difference for all supported pixel formats, so we could stop doing that `odd` gen_twopix() parameter game. So the first thing we are doing here is 1) moving gen_twopix() calls out of gen_text() into vivi_fillbuff(), to pregenerate black and white colors, just before printing starts. what we have next is that gen_text's font rendering loop, even with gen_twopix() calls moved out, was inefficient and branchy, so let's 2) rewrite gen_text() loop so it uses less variables + unroll char horizontal-rendering loop + instantiate 3 code paths for pixelsizes 2,3 and 4 so that in all inner loops we don't have to branch or make indirections (*). Done all above reworks, for gen_text() we get nice, non-branchy streamlined code (showing loop for pixelsize=2): ? cmp $0x2,%eax ? ? jne 26 ? mov -0x18(%ebp),%eax ? mov -0x20(%ebp),%edi ? imul -0x20(%ebp),%eax ? movzwl 0x3ffc(%ebx),%esi 0,08 ? movzwl 0x4000(%ebx),%ecx 0,04 ? add %edi,%edi ? mov 0x0,%ebx 0,51 ? mov %edi,-0x1c(%ebp) ? mov %ebx,-0x14(%ebp) ? movl $0x0,-0x10(%ebp) ? lea 0x20(%edx,%eax,2),%eax ? mov %eax,-0x18(%ebp) ? xchg %ax,%ax 0,04 ? a0: mov 0x8(%ebp),%ebx ? mov -0x18(%ebp),%eax 0,04 ? movzbl (%ebx),%edx 0,16 ? test %dl,%dl 0,04 ? ? je 128 0,08 ? lea 0x0(%esi),%esi 1,61 ? b0:???shl $0x4,%edx 1,02 ? ? mov -0x14(%ebp),%edi 2,04 ? ? add -0x10(%ebp),%edx 2,24 ? ? lea 0x1(%ebx),%ebx 0,27 ? ? movzbl (%edi,%edx,1),%edx 9,92 ? ? mov %esi,%edi 0,39 ? ? test %dl,%dl 2,04 ? ? cmovns %ecx,%edi 4,63 ? ? test $0x40,%dl 0,55 ? ? mov %di,(%eax) 3,76 ? ? mov %esi,%edi 0,71 ? ? cmove %ecx,%edi 3,41 ? ? test $0x20,%dl 0,75 ? ? mov %di,0x2(%eax) 2,43 ? ? mov %esi,%edi 0,59 ? ? cmove %ecx,%edi 4,59 ? ? test $0x10,%dl 0,67 ? ? mov %di,0x4(%eax) 2,55 ? ? mov %esi,%edi 0,78 ? ? cmove %ecx,%edi 4,31 ? ? test $0x8,%dl 0,67 ? ? mov %di,0x6(%eax) 5,76 ? ? mov %esi,%edi 1,80 ? ? cmove %ecx,%edi 4,20 ? ? test $0x4,%dl 0,86 ? ? mov %di,0x8(%eax) 2,98 ? ? mov %esi,%edi 1,37 ? ? cmove %ecx,%edi 4,67 ? ? test $0x2,%dl 0,20 ? ? mov %di,0xa(%eax) 2,78 ? ? mov %esi,%edi 0,75 ? ? cmove %ecx,%edi 3,92 ? ? and $0x1,%edx 0,75 ? ? mov %esi,%edx 2,59 ? ? mov %di,0xc(%eax) 0,59 ? ? cmove %ecx,%edx 3,10 ? ? mov %dx,0xe(%eax) 2,39 ? ? add $0x10,%eax 0,51 ? ? movzbl (%ebx),%edx 2,86 ? ? test %dl,%dl 2,31 ? ???jne b0 0,04 ?128: addl $0x1,-0x10(%ebp) 4,00 ? mov -0x1c(%ebp),%eax 0,04 ? add %eax,-0x18(%ebp) 0,08 ? cmpl $0x10,-0x10(%ebp) ? ? jne a0 which almost goes away from the profile: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # Samples: 49K of event 'cycles' # Event count (approx.): 16799780016 # # Overhead Command Shared Object Symbol # ........ ............... .................... # 27.51% rawv libc-2.13.so [.] __memcpy_ssse3 23.77% vivi-* [kernel.kallsyms] [k] memcpy 9.96% Xorg [unknown] [.] 0xa76f5e12 4.94% vivi-* [vivi] [k] gen_text.constprop.6 4.44% rawv [vivi] [k] gen_twopix 3.17% vivi-* [vivi] [k] vivi_fillbuff 2.45% rawv [vivi] [k] precalculate_line 1.20% swapper [kernel.kallsyms] [k] read_hpet i.e. gen_twopix() overhead dropped from 49% to 4% and gen_text() loops from ~8% to ~4%, and overal cycles count dropped from 31551930117 to 16799780016 which is ~1.9x whole workload speedup. (*) for RGB24 rendering I've introduced x24, which could be thought as synthetic u24 for simplifying the code. That's done because for memcpy used for conditional assignment, gcc generates suboptimal code with more indirections. Fortunately, in C struct assignment is builtin and that's all we need from pixeltype for font rendering. Signed-off-by: Kirill Smelkov <kirr@mns.spb.ru> Acked-by: Hans Verkuil <hans.verkuil@cisco.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
2012-11-02 12:10:30 +00:00
#define PRINTSTR(PIXTYPE) do { \
PIXTYPE fg; \
PIXTYPE bg; \
memcpy(&fg, &dev->textfg, sizeof(PIXTYPE)); \
memcpy(&bg, &dev->textbg, sizeof(PIXTYPE)); \
\
for (line = 0; line < 16; line++) { \
PIXTYPE *pos = (PIXTYPE *)( basep + ((y + line) * width + x) * sizeof(PIXTYPE) ); \
u8 *s; \
\
for (s = text; *s; s++) { \
u8 chr = font8x16[*s * 16 + line]; \
\
pos[0] = (chr & (0x01 << 7) ? fg : bg); \
pos[1] = (chr & (0x01 << 6) ? fg : bg); \
pos[2] = (chr & (0x01 << 5) ? fg : bg); \
pos[3] = (chr & (0x01 << 4) ? fg : bg); \
pos[4] = (chr & (0x01 << 3) ? fg : bg); \
pos[5] = (chr & (0x01 << 2) ? fg : bg); \
pos[6] = (chr & (0x01 << 1) ? fg : bg); \
pos[7] = (chr & (0x01 << 0) ? fg : bg); \
\
pos += 8; \
} \
} \
} while (0)
switch (dev->pixelsize) {
case 2:
PRINTSTR(u16); break;
case 4:
PRINTSTR(u32); break;
case 3:
PRINTSTR(x24); break;
}
}
static void vivi_fillbuff(struct vivi_dev *dev, struct vivi_buffer *buf)
{
[media] vivi: Move computations out of vivi_fillbuf linecopy loop The "dev->mvcount % wmax" thing was showing high in profiles (we do it for each line which ~ 500 per frame) ? 000010c0 <vivi_fillbuff>: ... 0,39 ? 70:???mov 0x3ff4(%edi),%esi 0,22 ? 76:? mov 0x2a0(%edi),%eax 0,30 ? ? mov -0x84(%ebp),%ebx 0,35 ? ? mov %eax,%edx 0,04 ? ? mov -0x7c(%ebp),%ecx 0,35 ? ? sar $0x1f,%edx 0,44 ? ? idivl -0x7c(%ebp) 21,68 ? ? imul %esi,%ecx 0,70 ? ? imul %esi,%ebx 0,52 ? ? add -0x88(%ebp),%ebx 1,65 ? ? mov %ebx,%eax 0,22 ? ? imul %edx,%esi 0,04 ? ? lea 0x3f4(%edi,%esi,1),%edx 2,18 ? ?? call vivi_fillbuff+0xa6 0,74 ? ? addl $0x1,-0x80(%ebp) 62,69 ? ? mov -0x7c(%ebp),%edx 1,18 ? ? mov -0x80(%ebp),%ecx 0,35 ? ? add %edx,-0x84(%ebp) 0,61 ? ? cmp %ecx,-0x8c(%ebp) 0,22 ? ???jne 70 so since all variables stay the same for all iterations let's move computations out of the loop: the abovementioned division and "width*pixelsize" too before: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # # Samples: 49K of event 'cycles' # Event count (approx.): 16475832370 # # Overhead Command Shared Object # ........ ............... ...................... # 29.07% rawv libc-2.13.so [.] __memcpy_ssse3 20.57% vivi-* [kernel.kallsyms] [k] memcpy 10.20% Xorg [unknown] [.] 0xa7301494 5.16% vivi-* [vivi] [k] gen_text.constprop.6 4.43% rawv [vivi] [k] gen_twopix 4.36% vivi-* [vivi] [k] vivi_fillbuff 2.42% rawv [vivi] [k] precalculate_line 1.33% swapper [kernel.kallsyms] [k] read_hpet after: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # # Samples: 46K of event 'cycles' # Event count (approx.): 15574200568 # # Overhead Command Shared Object # ........ ............... .................... # 27.99% rawv libc-2.13.so [.] __memcpy_ssse3 23.29% vivi-* [kernel.kallsyms] [k] memcpy 10.30% Xorg [unknown] [.] 0xa75c98f8 5.34% vivi-* [vivi] [k] gen_text.constprop.6 4.61% rawv [vivi] [k] gen_twopix 2.64% rawv [vivi] [k] precalculate_line 1.37% swapper [kernel.kallsyms] [k] read_hpet 0.79% Xorg [kernel.kallsyms] [k] read_hpet 0.64% Xorg [kernel.kallsyms] [k] unix_poll 0.45% Xorg [kernel.kallsyms] [k] fget_light 0.43% rawv libxcb.so.1.1.0 [.] 0x0000aae9 0.40% runsv [kernel.kallsyms] [k] ext2_try_to_allocate 0.36% Xorg [kernel.kallsyms] [k] _raw_spin_lock_irqsave 0.31% vivi-* [vivi] [k] vivi_fillbuff (i.e. vivi_fillbuff own overhead is almost gone) Signed-off-by: Kirill Smelkov <kirr@mns.spb.ru> Acked-by: Hans Verkuil <hans.verkuil@cisco.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
2012-11-02 12:10:32 +00:00
int stride = dev->width * dev->pixelsize;
int hmax = dev->height;
void *vbuf = vb2_plane_vaddr(&buf->vb, 0);
unsigned ms;
char str[100];
int h, line = 1;
[media] vivi: Move computations out of vivi_fillbuf linecopy loop The "dev->mvcount % wmax" thing was showing high in profiles (we do it for each line which ~ 500 per frame) ? 000010c0 <vivi_fillbuff>: ... 0,39 ? 70:???mov 0x3ff4(%edi),%esi 0,22 ? 76:? mov 0x2a0(%edi),%eax 0,30 ? ? mov -0x84(%ebp),%ebx 0,35 ? ? mov %eax,%edx 0,04 ? ? mov -0x7c(%ebp),%ecx 0,35 ? ? sar $0x1f,%edx 0,44 ? ? idivl -0x7c(%ebp) 21,68 ? ? imul %esi,%ecx 0,70 ? ? imul %esi,%ebx 0,52 ? ? add -0x88(%ebp),%ebx 1,65 ? ? mov %ebx,%eax 0,22 ? ? imul %edx,%esi 0,04 ? ? lea 0x3f4(%edi,%esi,1),%edx 2,18 ? ?? call vivi_fillbuff+0xa6 0,74 ? ? addl $0x1,-0x80(%ebp) 62,69 ? ? mov -0x7c(%ebp),%edx 1,18 ? ? mov -0x80(%ebp),%ecx 0,35 ? ? add %edx,-0x84(%ebp) 0,61 ? ? cmp %ecx,-0x8c(%ebp) 0,22 ? ???jne 70 so since all variables stay the same for all iterations let's move computations out of the loop: the abovementioned division and "width*pixelsize" too before: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # # Samples: 49K of event 'cycles' # Event count (approx.): 16475832370 # # Overhead Command Shared Object # ........ ............... ...................... # 29.07% rawv libc-2.13.so [.] __memcpy_ssse3 20.57% vivi-* [kernel.kallsyms] [k] memcpy 10.20% Xorg [unknown] [.] 0xa7301494 5.16% vivi-* [vivi] [k] gen_text.constprop.6 4.43% rawv [vivi] [k] gen_twopix 4.36% vivi-* [vivi] [k] vivi_fillbuff 2.42% rawv [vivi] [k] precalculate_line 1.33% swapper [kernel.kallsyms] [k] read_hpet after: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # # Samples: 46K of event 'cycles' # Event count (approx.): 15574200568 # # Overhead Command Shared Object # ........ ............... .................... # 27.99% rawv libc-2.13.so [.] __memcpy_ssse3 23.29% vivi-* [kernel.kallsyms] [k] memcpy 10.30% Xorg [unknown] [.] 0xa75c98f8 5.34% vivi-* [vivi] [k] gen_text.constprop.6 4.61% rawv [vivi] [k] gen_twopix 2.64% rawv [vivi] [k] precalculate_line 1.37% swapper [kernel.kallsyms] [k] read_hpet 0.79% Xorg [kernel.kallsyms] [k] read_hpet 0.64% Xorg [kernel.kallsyms] [k] unix_poll 0.45% Xorg [kernel.kallsyms] [k] fget_light 0.43% rawv libxcb.so.1.1.0 [.] 0x0000aae9 0.40% runsv [kernel.kallsyms] [k] ext2_try_to_allocate 0.36% Xorg [kernel.kallsyms] [k] _raw_spin_lock_irqsave 0.31% vivi-* [vivi] [k] vivi_fillbuff (i.e. vivi_fillbuff own overhead is almost gone) Signed-off-by: Kirill Smelkov <kirr@mns.spb.ru> Acked-by: Hans Verkuil <hans.verkuil@cisco.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
2012-11-02 12:10:32 +00:00
u8 *linestart;
s32 gain;
if (!vbuf)
return;
[media] vivi: Move computations out of vivi_fillbuf linecopy loop The "dev->mvcount % wmax" thing was showing high in profiles (we do it for each line which ~ 500 per frame) ? 000010c0 <vivi_fillbuff>: ... 0,39 ? 70:???mov 0x3ff4(%edi),%esi 0,22 ? 76:? mov 0x2a0(%edi),%eax 0,30 ? ? mov -0x84(%ebp),%ebx 0,35 ? ? mov %eax,%edx 0,04 ? ? mov -0x7c(%ebp),%ecx 0,35 ? ? sar $0x1f,%edx 0,44 ? ? idivl -0x7c(%ebp) 21,68 ? ? imul %esi,%ecx 0,70 ? ? imul %esi,%ebx 0,52 ? ? add -0x88(%ebp),%ebx 1,65 ? ? mov %ebx,%eax 0,22 ? ? imul %edx,%esi 0,04 ? ? lea 0x3f4(%edi,%esi,1),%edx 2,18 ? ?? call vivi_fillbuff+0xa6 0,74 ? ? addl $0x1,-0x80(%ebp) 62,69 ? ? mov -0x7c(%ebp),%edx 1,18 ? ? mov -0x80(%ebp),%ecx 0,35 ? ? add %edx,-0x84(%ebp) 0,61 ? ? cmp %ecx,-0x8c(%ebp) 0,22 ? ???jne 70 so since all variables stay the same for all iterations let's move computations out of the loop: the abovementioned division and "width*pixelsize" too before: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # # Samples: 49K of event 'cycles' # Event count (approx.): 16475832370 # # Overhead Command Shared Object # ........ ............... ...................... # 29.07% rawv libc-2.13.so [.] __memcpy_ssse3 20.57% vivi-* [kernel.kallsyms] [k] memcpy 10.20% Xorg [unknown] [.] 0xa7301494 5.16% vivi-* [vivi] [k] gen_text.constprop.6 4.43% rawv [vivi] [k] gen_twopix 4.36% vivi-* [vivi] [k] vivi_fillbuff 2.42% rawv [vivi] [k] precalculate_line 1.33% swapper [kernel.kallsyms] [k] read_hpet after: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # # Samples: 46K of event 'cycles' # Event count (approx.): 15574200568 # # Overhead Command Shared Object # ........ ............... .................... # 27.99% rawv libc-2.13.so [.] __memcpy_ssse3 23.29% vivi-* [kernel.kallsyms] [k] memcpy 10.30% Xorg [unknown] [.] 0xa75c98f8 5.34% vivi-* [vivi] [k] gen_text.constprop.6 4.61% rawv [vivi] [k] gen_twopix 2.64% rawv [vivi] [k] precalculate_line 1.37% swapper [kernel.kallsyms] [k] read_hpet 0.79% Xorg [kernel.kallsyms] [k] read_hpet 0.64% Xorg [kernel.kallsyms] [k] unix_poll 0.45% Xorg [kernel.kallsyms] [k] fget_light 0.43% rawv libxcb.so.1.1.0 [.] 0x0000aae9 0.40% runsv [kernel.kallsyms] [k] ext2_try_to_allocate 0.36% Xorg [kernel.kallsyms] [k] _raw_spin_lock_irqsave 0.31% vivi-* [vivi] [k] vivi_fillbuff (i.e. vivi_fillbuff own overhead is almost gone) Signed-off-by: Kirill Smelkov <kirr@mns.spb.ru> Acked-by: Hans Verkuil <hans.verkuil@cisco.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
2012-11-02 12:10:32 +00:00
linestart = dev->line + (dev->mv_count % dev->width) * dev->pixelsize;
for (h = 0; h < hmax; h++)
[media] vivi: Move computations out of vivi_fillbuf linecopy loop The "dev->mvcount % wmax" thing was showing high in profiles (we do it for each line which ~ 500 per frame) ? 000010c0 <vivi_fillbuff>: ... 0,39 ? 70:???mov 0x3ff4(%edi),%esi 0,22 ? 76:? mov 0x2a0(%edi),%eax 0,30 ? ? mov -0x84(%ebp),%ebx 0,35 ? ? mov %eax,%edx 0,04 ? ? mov -0x7c(%ebp),%ecx 0,35 ? ? sar $0x1f,%edx 0,44 ? ? idivl -0x7c(%ebp) 21,68 ? ? imul %esi,%ecx 0,70 ? ? imul %esi,%ebx 0,52 ? ? add -0x88(%ebp),%ebx 1,65 ? ? mov %ebx,%eax 0,22 ? ? imul %edx,%esi 0,04 ? ? lea 0x3f4(%edi,%esi,1),%edx 2,18 ? ?? call vivi_fillbuff+0xa6 0,74 ? ? addl $0x1,-0x80(%ebp) 62,69 ? ? mov -0x7c(%ebp),%edx 1,18 ? ? mov -0x80(%ebp),%ecx 0,35 ? ? add %edx,-0x84(%ebp) 0,61 ? ? cmp %ecx,-0x8c(%ebp) 0,22 ? ???jne 70 so since all variables stay the same for all iterations let's move computations out of the loop: the abovementioned division and "width*pixelsize" too before: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # # Samples: 49K of event 'cycles' # Event count (approx.): 16475832370 # # Overhead Command Shared Object # ........ ............... ...................... # 29.07% rawv libc-2.13.so [.] __memcpy_ssse3 20.57% vivi-* [kernel.kallsyms] [k] memcpy 10.20% Xorg [unknown] [.] 0xa7301494 5.16% vivi-* [vivi] [k] gen_text.constprop.6 4.43% rawv [vivi] [k] gen_twopix 4.36% vivi-* [vivi] [k] vivi_fillbuff 2.42% rawv [vivi] [k] precalculate_line 1.33% swapper [kernel.kallsyms] [k] read_hpet after: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # # Samples: 46K of event 'cycles' # Event count (approx.): 15574200568 # # Overhead Command Shared Object # ........ ............... .................... # 27.99% rawv libc-2.13.so [.] __memcpy_ssse3 23.29% vivi-* [kernel.kallsyms] [k] memcpy 10.30% Xorg [unknown] [.] 0xa75c98f8 5.34% vivi-* [vivi] [k] gen_text.constprop.6 4.61% rawv [vivi] [k] gen_twopix 2.64% rawv [vivi] [k] precalculate_line 1.37% swapper [kernel.kallsyms] [k] read_hpet 0.79% Xorg [kernel.kallsyms] [k] read_hpet 0.64% Xorg [kernel.kallsyms] [k] unix_poll 0.45% Xorg [kernel.kallsyms] [k] fget_light 0.43% rawv libxcb.so.1.1.0 [.] 0x0000aae9 0.40% runsv [kernel.kallsyms] [k] ext2_try_to_allocate 0.36% Xorg [kernel.kallsyms] [k] _raw_spin_lock_irqsave 0.31% vivi-* [vivi] [k] vivi_fillbuff (i.e. vivi_fillbuff own overhead is almost gone) Signed-off-by: Kirill Smelkov <kirr@mns.spb.ru> Acked-by: Hans Verkuil <hans.verkuil@cisco.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
2012-11-02 12:10:32 +00:00
memcpy(vbuf + h * stride, linestart, stride);
/* Updates stream time */
[media] vivi: Optimize gen_text() I've noticed that vivi takes a lot of CPU to produce its frames. For example for 8 devices and 8 simple programs running, where each captures YUY2 640x480 and displays it to X via SDL, profile timing is as follows: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # Samples: 82K of event 'cycles' # Event count (approx.): 31551930117 # # Overhead Command Shared Object Symbol # ........ ............... .................... # 49.48% vivi-* [vivi] [k] gen_twopix 10.79% vivi-* [kernel.kallsyms] [k] memcpy 10.02% rawv libc-2.13.so [.] __memcpy_ssse3 8.35% vivi-* [vivi] [k] gen_text.constprop.6 5.06% Xorg [unknown] [.] 0xa73015f8 2.32% rawv [vivi] [k] gen_twopix 1.22% rawv [vivi] [k] precalculate_line 1.20% vivi-* [vivi] [k] vivi_fillbuff (rawv is display program, vivi-* is a combination of vivi-000 through vivi-007) so a lot of time is spent in gen_twopix() which as the follwing call-graph profile shows ... 49.48% vivi-* [vivi] [k] gen_twopix | --- gen_twopix | |--96.30%-- gen_text.constprop.6 | vivi_fillbuff | vivi_thread | kthread | ret_from_kernel_thread | --3.70%-- vivi_fillbuff vivi_thread kthread ret_from_kernel_thread ... is called mostly from gen_text(). If we'll look at gen_text(), in the inner loop, we'll see if (chr & (1 << (7 - i))) gen_twopix(dev, pos + j * dev->pixelsize, WHITE, (x+y) & 1); else gen_twopix(dev, pos + j * dev->pixelsize, TEXT_BLACK, (x+y) & 1); which calls gen_twopix() for every character pixel, and that is very expensive, because gen_twopix() branches several times. Now, let's note, that we operate on only two colors - WHITE and TEXT_BLACK, and that pixel for that colors could be precomputed and gen_twopix() moved out of the inner loop. Also note, that for black and white colors even/odd does not make a difference for all supported pixel formats, so we could stop doing that `odd` gen_twopix() parameter game. So the first thing we are doing here is 1) moving gen_twopix() calls out of gen_text() into vivi_fillbuff(), to pregenerate black and white colors, just before printing starts. what we have next is that gen_text's font rendering loop, even with gen_twopix() calls moved out, was inefficient and branchy, so let's 2) rewrite gen_text() loop so it uses less variables + unroll char horizontal-rendering loop + instantiate 3 code paths for pixelsizes 2,3 and 4 so that in all inner loops we don't have to branch or make indirections (*). Done all above reworks, for gen_text() we get nice, non-branchy streamlined code (showing loop for pixelsize=2): ? cmp $0x2,%eax ? ? jne 26 ? mov -0x18(%ebp),%eax ? mov -0x20(%ebp),%edi ? imul -0x20(%ebp),%eax ? movzwl 0x3ffc(%ebx),%esi 0,08 ? movzwl 0x4000(%ebx),%ecx 0,04 ? add %edi,%edi ? mov 0x0,%ebx 0,51 ? mov %edi,-0x1c(%ebp) ? mov %ebx,-0x14(%ebp) ? movl $0x0,-0x10(%ebp) ? lea 0x20(%edx,%eax,2),%eax ? mov %eax,-0x18(%ebp) ? xchg %ax,%ax 0,04 ? a0: mov 0x8(%ebp),%ebx ? mov -0x18(%ebp),%eax 0,04 ? movzbl (%ebx),%edx 0,16 ? test %dl,%dl 0,04 ? ? je 128 0,08 ? lea 0x0(%esi),%esi 1,61 ? b0:???shl $0x4,%edx 1,02 ? ? mov -0x14(%ebp),%edi 2,04 ? ? add -0x10(%ebp),%edx 2,24 ? ? lea 0x1(%ebx),%ebx 0,27 ? ? movzbl (%edi,%edx,1),%edx 9,92 ? ? mov %esi,%edi 0,39 ? ? test %dl,%dl 2,04 ? ? cmovns %ecx,%edi 4,63 ? ? test $0x40,%dl 0,55 ? ? mov %di,(%eax) 3,76 ? ? mov %esi,%edi 0,71 ? ? cmove %ecx,%edi 3,41 ? ? test $0x20,%dl 0,75 ? ? mov %di,0x2(%eax) 2,43 ? ? mov %esi,%edi 0,59 ? ? cmove %ecx,%edi 4,59 ? ? test $0x10,%dl 0,67 ? ? mov %di,0x4(%eax) 2,55 ? ? mov %esi,%edi 0,78 ? ? cmove %ecx,%edi 4,31 ? ? test $0x8,%dl 0,67 ? ? mov %di,0x6(%eax) 5,76 ? ? mov %esi,%edi 1,80 ? ? cmove %ecx,%edi 4,20 ? ? test $0x4,%dl 0,86 ? ? mov %di,0x8(%eax) 2,98 ? ? mov %esi,%edi 1,37 ? ? cmove %ecx,%edi 4,67 ? ? test $0x2,%dl 0,20 ? ? mov %di,0xa(%eax) 2,78 ? ? mov %esi,%edi 0,75 ? ? cmove %ecx,%edi 3,92 ? ? and $0x1,%edx 0,75 ? ? mov %esi,%edx 2,59 ? ? mov %di,0xc(%eax) 0,59 ? ? cmove %ecx,%edx 3,10 ? ? mov %dx,0xe(%eax) 2,39 ? ? add $0x10,%eax 0,51 ? ? movzbl (%ebx),%edx 2,86 ? ? test %dl,%dl 2,31 ? ???jne b0 0,04 ?128: addl $0x1,-0x10(%ebp) 4,00 ? mov -0x1c(%ebp),%eax 0,04 ? add %eax,-0x18(%ebp) 0,08 ? cmpl $0x10,-0x10(%ebp) ? ? jne a0 which almost goes away from the profile: # cmdline : /home/kirr/local/perf/bin/perf record -g -a sleep 20 # Samples: 49K of event 'cycles' # Event count (approx.): 16799780016 # # Overhead Command Shared Object Symbol # ........ ............... .................... # 27.51% rawv libc-2.13.so [.] __memcpy_ssse3 23.77% vivi-* [kernel.kallsyms] [k] memcpy 9.96% Xorg [unknown] [.] 0xa76f5e12 4.94% vivi-* [vivi] [k] gen_text.constprop.6 4.44% rawv [vivi] [k] gen_twopix 3.17% vivi-* [vivi] [k] vivi_fillbuff 2.45% rawv [vivi] [k] precalculate_line 1.20% swapper [kernel.kallsyms] [k] read_hpet i.e. gen_twopix() overhead dropped from 49% to 4% and gen_text() loops from ~8% to ~4%, and overal cycles count dropped from 31551930117 to 16799780016 which is ~1.9x whole workload speedup. (*) for RGB24 rendering I've introduced x24, which could be thought as synthetic u24 for simplifying the code. That's done because for memcpy used for conditional assignment, gcc generates suboptimal code with more indirections. Fortunately, in C struct assignment is builtin and that's all we need from pixeltype for font rendering. Signed-off-by: Kirill Smelkov <kirr@mns.spb.ru> Acked-by: Hans Verkuil <hans.verkuil@cisco.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
2012-11-02 12:10:30 +00:00
gen_twopix(dev, (u8 *)&dev->textbg, TEXT_BLACK, /*odd=*/ 0);
gen_twopix(dev, (u8 *)&dev->textfg, WHITE, /*odd=*/ 0);
dev->ms += jiffies_to_msecs(jiffies - dev->jiffies);
dev->jiffies = jiffies;
ms = dev->ms;
snprintf(str, sizeof(str), " %02d:%02d:%02d:%03d ",
(ms / (60 * 60 * 1000)) % 24,
(ms / (60 * 1000)) % 60,
(ms / 1000) % 60,
ms % 1000);
gen_text(dev, vbuf, line++ * 16, 16, str);
snprintf(str, sizeof(str), " %dx%d, input %d ",
dev->width, dev->height, dev->input);
gen_text(dev, vbuf, line++ * 16, 16, str);
gain = v4l2_ctrl_g_ctrl(dev->gain);
mutex_lock(dev->ctrl_handler.lock);
snprintf(str, sizeof(str), " brightness %3d, contrast %3d, saturation %3d, hue %d ",
dev->brightness->cur.val,
dev->contrast->cur.val,
dev->saturation->cur.val,
dev->hue->cur.val);
gen_text(dev, vbuf, line++ * 16, 16, str);
snprintf(str, sizeof(str), " autogain %d, gain %3d, volume %3d, alpha 0x%02x ",
dev->autogain->cur.val, gain, dev->volume->cur.val,
dev->alpha->cur.val);
gen_text(dev, vbuf, line++ * 16, 16, str);
snprintf(str, sizeof(str), " int32 %d, int64 %lld, bitmask %08x ",
dev->int32->cur.val,
dev->int64->cur.val64,
dev->bitmask->cur.val);
gen_text(dev, vbuf, line++ * 16, 16, str);
snprintf(str, sizeof(str), " boolean %d, menu %s, string \"%s\" ",
dev->boolean->cur.val,
dev->menu->qmenu[dev->menu->cur.val],
dev->string->cur.string);
gen_text(dev, vbuf, line++ * 16, 16, str);
snprintf(str, sizeof(str), " integer_menu %lld, value %d ",
dev->int_menu->qmenu_int[dev->int_menu->cur.val],
dev->int_menu->cur.val);
gen_text(dev, vbuf, line++ * 16, 16, str);
mutex_unlock(dev->ctrl_handler.lock);
if (dev->button_pressed) {
dev->button_pressed--;
snprintf(str, sizeof(str), " button pressed!");
gen_text(dev, vbuf, line++ * 16, 16, str);
}
dev->mv_count += 2;
buf->vb.v4l2_buf.field = V4L2_FIELD_INTERLACED;
buf->vb.v4l2_buf.sequence = dev->seq_count++;
v4l2_get_timestamp(&buf->vb.v4l2_buf.timestamp);
}
static void vivi_thread_tick(struct vivi_dev *dev)
{
struct vivi_dmaqueue *dma_q = &dev->vidq;
struct vivi_buffer *buf;
unsigned long flags = 0;
dprintk(dev, 1, "Thread tick\n");
spin_lock_irqsave(&dev->slock, flags);
if (list_empty(&dma_q->active)) {
dprintk(dev, 1, "No active queue to serve\n");
[media] vivi: Fix sleep-in-atomic-context Fix sleep-in-atomic-context bug in vivi: Jun 28 18:14:39 tschai kernel: [ 80.970478] BUG: sleeping function called from invalid context at kernel/mutex.c:271 Jun 28 18:14:39 tschai kernel: [ 80.970483] in_atomic(): 0, irqs_disabled(): 1, pid: 2854, name: vivi-000 Jun 28 18:14:39 tschai kernel: [ 80.970485] INFO: lockdep is turned off. Jun 28 18:14:39 tschai kernel: [ 80.970486] irq event stamp: 0 Jun 28 18:14:39 tschai kernel: [ 80.970487] hardirqs last enabled at (0): [< (null)>] (null) Jun 28 18:14:39 tschai kernel: [ 80.970490] hardirqs last disabled at (0): [<ffffffff8109a90b>] copy_process+0x61b/0x1440 Jun 28 18:14:39 tschai kernel: [ 80.970495] softirqs last enabled at (0): [<ffffffff8109a90b>] copy_process+0x61b/0x1440 Jun 28 18:14:39 tschai kernel: [ 80.970498] softirqs last disabled at (0): [< (null)>] (null) Jun 28 18:14:39 tschai kernel: [ 80.970502] Pid: 2854, comm: vivi-000 Tainted: P 3.0.0-rc1-tschai #372 Jun 28 18:14:39 tschai kernel: [ 80.970504] Call Trace: Jun 28 18:14:39 tschai kernel: [ 80.970509] [<ffffffff81089be3>] __might_sleep+0xf3/0x130 Jun 28 18:14:39 tschai kernel: [ 80.970512] [<ffffffff8176967f>] mutex_lock_nested+0x2f/0x60 Jun 28 18:14:39 tschai kernel: [ 80.970517] [<ffffffffa0acee3e>] vivi_fillbuff+0x20e/0x3f0 [vivi] Jun 28 18:14:39 tschai kernel: [ 80.970520] [<ffffffff81407004>] ? do_raw_spin_lock+0x54/0x150 Jun 28 18:14:39 tschai kernel: [ 80.970524] [<ffffffff8104ef5e>] ? read_tsc+0xe/0x20 Jun 28 18:14:39 tschai kernel: [ 80.970528] [<ffffffff810c9d87>] ? getnstimeofday+0x57/0xe0 Jun 28 18:14:39 tschai kernel: [ 80.970531] [<ffffffffa0acf1b1>] vivi_thread+0x191/0x2f0 [vivi] Jun 28 18:14:39 tschai kernel: [ 80.970534] [<ffffffff81093aa0>] ? try_to_wake_up+0x2d0/0x2d0 Jun 28 18:14:39 tschai kernel: [ 80.970537] [<ffffffffa0acf020>] ? vivi_fillbuff+0x3f0/0x3f0 [vivi] Jun 28 18:14:39 tschai kernel: [ 80.970541] [<ffffffff810bff46>] kthread+0xb6/0xc0 Jun 28 18:14:39 tschai kernel: [ 80.970544] [<ffffffff817743e4>] kernel_thread_helper+0x4/0x10 Jun 28 18:14:39 tschai kernel: [ 80.970547] [<ffffffff8176b4d4>] ? retint_restore_args+0x13/0x13 Jun 28 18:14:39 tschai kernel: [ 80.970550] [<ffffffff810bfe90>] ? __init_kthread_worker+0x70/0x70 Jun 28 18:14:39 tschai kernel: [ 80.970552] [<ffffffff817743e0>] ? gs_change+0x13/0x13 This bug was introduced in 2.6.39. Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
2011-07-05 10:19:23 +00:00
spin_unlock_irqrestore(&dev->slock, flags);
return;
}
buf = list_entry(dma_q->active.next, struct vivi_buffer, list);
list_del(&buf->list);
[media] vivi: Fix sleep-in-atomic-context Fix sleep-in-atomic-context bug in vivi: Jun 28 18:14:39 tschai kernel: [ 80.970478] BUG: sleeping function called from invalid context at kernel/mutex.c:271 Jun 28 18:14:39 tschai kernel: [ 80.970483] in_atomic(): 0, irqs_disabled(): 1, pid: 2854, name: vivi-000 Jun 28 18:14:39 tschai kernel: [ 80.970485] INFO: lockdep is turned off. Jun 28 18:14:39 tschai kernel: [ 80.970486] irq event stamp: 0 Jun 28 18:14:39 tschai kernel: [ 80.970487] hardirqs last enabled at (0): [< (null)>] (null) Jun 28 18:14:39 tschai kernel: [ 80.970490] hardirqs last disabled at (0): [<ffffffff8109a90b>] copy_process+0x61b/0x1440 Jun 28 18:14:39 tschai kernel: [ 80.970495] softirqs last enabled at (0): [<ffffffff8109a90b>] copy_process+0x61b/0x1440 Jun 28 18:14:39 tschai kernel: [ 80.970498] softirqs last disabled at (0): [< (null)>] (null) Jun 28 18:14:39 tschai kernel: [ 80.970502] Pid: 2854, comm: vivi-000 Tainted: P 3.0.0-rc1-tschai #372 Jun 28 18:14:39 tschai kernel: [ 80.970504] Call Trace: Jun 28 18:14:39 tschai kernel: [ 80.970509] [<ffffffff81089be3>] __might_sleep+0xf3/0x130 Jun 28 18:14:39 tschai kernel: [ 80.970512] [<ffffffff8176967f>] mutex_lock_nested+0x2f/0x60 Jun 28 18:14:39 tschai kernel: [ 80.970517] [<ffffffffa0acee3e>] vivi_fillbuff+0x20e/0x3f0 [vivi] Jun 28 18:14:39 tschai kernel: [ 80.970520] [<ffffffff81407004>] ? do_raw_spin_lock+0x54/0x150 Jun 28 18:14:39 tschai kernel: [ 80.970524] [<ffffffff8104ef5e>] ? read_tsc+0xe/0x20 Jun 28 18:14:39 tschai kernel: [ 80.970528] [<ffffffff810c9d87>] ? getnstimeofday+0x57/0xe0 Jun 28 18:14:39 tschai kernel: [ 80.970531] [<ffffffffa0acf1b1>] vivi_thread+0x191/0x2f0 [vivi] Jun 28 18:14:39 tschai kernel: [ 80.970534] [<ffffffff81093aa0>] ? try_to_wake_up+0x2d0/0x2d0 Jun 28 18:14:39 tschai kernel: [ 80.970537] [<ffffffffa0acf020>] ? vivi_fillbuff+0x3f0/0x3f0 [vivi] Jun 28 18:14:39 tschai kernel: [ 80.970541] [<ffffffff810bff46>] kthread+0xb6/0xc0 Jun 28 18:14:39 tschai kernel: [ 80.970544] [<ffffffff817743e4>] kernel_thread_helper+0x4/0x10 Jun 28 18:14:39 tschai kernel: [ 80.970547] [<ffffffff8176b4d4>] ? retint_restore_args+0x13/0x13 Jun 28 18:14:39 tschai kernel: [ 80.970550] [<ffffffff810bfe90>] ? __init_kthread_worker+0x70/0x70 Jun 28 18:14:39 tschai kernel: [ 80.970552] [<ffffffff817743e0>] ? gs_change+0x13/0x13 This bug was introduced in 2.6.39. Signed-off-by: Hans Verkuil <hans.verkuil@cisco.com> Signed-off-by: Mauro Carvalho Chehab <mchehab@redhat.com>
2011-07-05 10:19:23 +00:00
spin_unlock_irqrestore(&dev->slock, flags);
v4l2_get_timestamp(&buf->vb.v4l2_buf.timestamp);
/* Fill buffer */
vivi_fillbuff(dev, buf);
dprintk(dev, 1, "filled buffer %p\n", buf);
vb2_buffer_done(&buf->vb, VB2_BUF_STATE_DONE);
dprintk(dev, 2, "[%p/%d] done\n", buf, buf->vb.v4l2_buf.index);
}
#define frames_to_ms(dev, frames) \
((frames * dev->timeperframe.numerator * 1000) / dev->timeperframe.denominator)
static void vivi_sleep(struct vivi_dev *dev)
{
struct vivi_dmaqueue *dma_q = &dev->vidq;
int timeout;
DECLARE_WAITQUEUE(wait, current);
dprintk(dev, 1, "%s dma_q=0x%08lx\n", __func__,
(unsigned long)dma_q);
add_wait_queue(&dma_q->wq, &wait);
if (kthread_should_stop())
goto stop_task;
/* Calculate time to wake up */
timeout = msecs_to_jiffies(frames_to_ms(dev, 1));
vivi_thread_tick(dev);
schedule_timeout_interruptible(timeout);
stop_task:
remove_wait_queue(&dma_q->wq, &wait);
try_to_freeze();
}
static int vivi_thread(void *data)
{
struct vivi_dev *dev = data;
dprintk(dev, 1, "thread started\n");
set_freezable();
for (;;) {
vivi_sleep(dev);
if (kthread_should_stop())
break;
}
dprintk(dev, 1, "thread: exit\n");
return 0;
}
static int vivi_start_generating(struct vivi_dev *dev)
{
struct vivi_dmaqueue *dma_q = &dev->vidq;
dprintk(dev, 1, "%s\n", __func__);
/* Resets frame counters */
dev->ms = 0;
dev->mv_count = 0;
dev->jiffies = jiffies;
dma_q->frame = 0;
dma_q->ini_jiffies = jiffies;
dma_q->kthread = kthread_run(vivi_thread, dev, "%s",
dev->v4l2_dev.name);
if (IS_ERR(dma_q->kthread)) {
v4l2_err(&dev->v4l2_dev, "kernel_thread() failed\n");
return PTR_ERR(dma_q->kthread);
}
/* Wakes thread */
wake_up_interruptible(&dma_q->wq);
dprintk(dev, 1, "returning from %s\n", __func__);
return 0;
}
static void vivi_stop_generating(struct vivi_dev *dev)
{
struct vivi_dmaqueue *dma_q = &dev->vidq;
dprintk(dev, 1, "%s\n", __func__);
/* shutdown control thread */
if (dma_q->kthread) {
kthread_stop(dma_q->kthread);
dma_q->kthread = NULL;
}
/*
* Typical driver might need to wait here until dma engine stops.
* In this case we can abort imiedetly, so it's just a noop.
*/
/* Release all active buffers */
while (!list_empty(&dma_q->active)) {
struct vivi_buffer *buf;
buf = list_entry(dma_q->active.next, struct vivi_buffer, list);
list_del(&buf->list);
vb2_buffer_done(&buf->vb, VB2_BUF_STATE_ERROR);
dprintk(dev, 2, "[%p/%d] done\n", buf, buf->vb.v4l2_buf.index);
}
}
/* ------------------------------------------------------------------
Videobuf operations
------------------------------------------------------------------*/
static int queue_setup(struct vb2_queue *vq, const struct v4l2_format *fmt,
unsigned int *nbuffers, unsigned int *nplanes,
unsigned int sizes[], void *alloc_ctxs[])
{
struct vivi_dev *dev = vb2_get_drv_priv(vq);
unsigned long size;
size = dev->width * dev->height * dev->pixelsize;
if (fmt) {
if (fmt->fmt.pix.sizeimage < size)
return -EINVAL;
size = fmt->fmt.pix.sizeimage;
/* check against insane over 8K resolution buffers */
if (size > 7680 * 4320 * dev->pixelsize)
return -EINVAL;
}
*nplanes = 1;
sizes[0] = size;
/*
* videobuf2-vmalloc allocator is context-less so no need to set
* alloc_ctxs array.
*/
dprintk(dev, 1, "%s, count=%d, size=%ld\n", __func__,
*nbuffers, size);
return 0;
}
static int buffer_prepare(struct vb2_buffer *vb)
{
struct vivi_dev *dev = vb2_get_drv_priv(vb->vb2_queue);
struct vivi_buffer *buf = container_of(vb, struct vivi_buffer, vb);
unsigned long size;
dprintk(dev, 1, "%s, field=%d\n", __func__, vb->v4l2_buf.field);
BUG_ON(NULL == dev->fmt);
/*
* Theses properties only change when queue is idle, see s_fmt.
* The below checks should not be performed here, on each
* buffer_prepare (i.e. on each qbuf). Most of the code in this function
* should thus be moved to buffer_init and s_fmt.
*/
if (dev->width < 48 || dev->width > MAX_WIDTH ||
dev->height < 32 || dev->height > MAX_HEIGHT)
return -EINVAL;
size = dev->width * dev->height * dev->pixelsize;
if (vb2_plane_size(vb, 0) < size) {
dprintk(dev, 1, "%s data will not fit into plane (%lu < %lu)\n",
__func__, vb2_plane_size(vb, 0), size);
return -EINVAL;
}
vb2_set_plane_payload(&buf->vb, 0, size);
precalculate_bars(dev);
precalculate_line(dev);
return 0;
}
static void buffer_queue(struct vb2_buffer *vb)
{
struct vivi_dev *dev = vb2_get_drv_priv(vb->vb2_queue);
struct vivi_buffer *buf = container_of(vb, struct vivi_buffer, vb);
struct vivi_dmaqueue *vidq = &dev->vidq;
unsigned long flags = 0;
dprintk(dev, 1, "%s\n", __func__);
spin_lock_irqsave(&dev->slock, flags);
list_add_tail(&buf->list, &vidq->active);
spin_unlock_irqrestore(&dev->slock, flags);
}
static int start_streaming(struct vb2_queue *vq, unsigned int count)
{
struct vivi_dev *dev = vb2_get_drv_priv(vq);
int err;
dprintk(dev, 1, "%s\n", __func__);
dev->seq_count = 0;
err = vivi_start_generating(dev);
if (err) {
struct vivi_buffer *buf, *tmp;
list_for_each_entry_safe(buf, tmp, &dev->vidq.active, list) {
list_del(&buf->list);
vb2_buffer_done(&buf->vb, VB2_BUF_STATE_QUEUED);
}
}
return err;
}
/* abort streaming and wait for last buffer */
static int stop_streaming(struct vb2_queue *vq)
{
struct vivi_dev *dev = vb2_get_drv_priv(vq);
dprintk(dev, 1, "%s\n", __func__);
vivi_stop_generating(dev);
return 0;
}
static void vivi_lock(struct vb2_queue *vq)
{
struct vivi_dev *dev = vb2_get_drv_priv(vq);
mutex_lock(&dev->mutex);
}
static void vivi_unlock(struct vb2_queue *vq)
{
struct vivi_dev *dev = vb2_get_drv_priv(vq);
mutex_unlock(&dev->mutex);
}
static const struct vb2_ops vivi_video_qops = {
.queue_setup = queue_setup,
.buf_prepare = buffer_prepare,
.buf_queue = buffer_queue,
.start_streaming = start_streaming,
.stop_streaming = stop_streaming,
.wait_prepare = vivi_unlock,
.wait_finish = vivi_lock,
};
/* ------------------------------------------------------------------
IOCTL vidioc handling
------------------------------------------------------------------*/
static int vidioc_querycap(struct file *file, void *priv,
struct v4l2_capability *cap)
{
struct vivi_dev *dev = video_drvdata(file);
strcpy(cap->driver, "vivi");
strcpy(cap->card, "vivi");
snprintf(cap->bus_info, sizeof(cap->bus_info),
"platform:%s", dev->v4l2_dev.name);
cap->device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_STREAMING |
V4L2_CAP_READWRITE;
cap->capabilities = cap->device_caps | V4L2_CAP_DEVICE_CAPS;
return 0;
}
static int vidioc_enum_fmt_vid_cap(struct file *file, void *priv,
struct v4l2_fmtdesc *f)
{
const struct vivi_fmt *fmt;
if (f->index >= ARRAY_SIZE(formats))
return -EINVAL;
fmt = &formats[f->index];
strlcpy(f->description, fmt->name, sizeof(f->description));
f->pixelformat = fmt->fourcc;
return 0;
}
static int vidioc_g_fmt_vid_cap(struct file *file, void *priv,
struct v4l2_format *f)
{
struct vivi_dev *dev = video_drvdata(file);
f->fmt.pix.width = dev->width;
f->fmt.pix.height = dev->height;
f->fmt.pix.field = V4L2_FIELD_INTERLACED;
f->fmt.pix.pixelformat = dev->fmt->fourcc;
f->fmt.pix.bytesperline =
(f->fmt.pix.width * dev->fmt->depth) >> 3;
f->fmt.pix.sizeimage =
f->fmt.pix.height * f->fmt.pix.bytesperline;
if (dev->fmt->is_yuv)
f->fmt.pix.colorspace = V4L2_COLORSPACE_SMPTE170M;
else
f->fmt.pix.colorspace = V4L2_COLORSPACE_SRGB;
return 0;
}
static int vidioc_try_fmt_vid_cap(struct file *file, void *priv,
struct v4l2_format *f)
{
struct vivi_dev *dev = video_drvdata(file);
const struct vivi_fmt *fmt;
fmt = get_format(f);
if (!fmt) {
dprintk(dev, 1, "Fourcc format (0x%08x) unknown.\n",
f->fmt.pix.pixelformat);
f->fmt.pix.pixelformat = V4L2_PIX_FMT_YUYV;
fmt = get_format(f);
}
f->fmt.pix.field = V4L2_FIELD_INTERLACED;
v4l_bound_align_image(&f->fmt.pix.width, 48, MAX_WIDTH, 2,
&f->fmt.pix.height, 32, MAX_HEIGHT, 0, 0);
f->fmt.pix.bytesperline =
(f->fmt.pix.width * fmt->depth) >> 3;
f->fmt.pix.sizeimage =
f->fmt.pix.height * f->fmt.pix.bytesperline;
if (fmt->is_yuv)
f->fmt.pix.colorspace = V4L2_COLORSPACE_SMPTE170M;
else
f->fmt.pix.colorspace = V4L2_COLORSPACE_SRGB;
f->fmt.pix.priv = 0;
return 0;
}
static int vidioc_s_fmt_vid_cap(struct file *file, void *priv,
struct v4l2_format *f)
{
struct vivi_dev *dev = video_drvdata(file);
struct vb2_queue *q = &dev->vb_vidq;
int ret = vidioc_try_fmt_vid_cap(file, priv, f);
if (ret < 0)
return ret;
if (vb2_is_busy(q)) {
dprintk(dev, 1, "%s device busy\n", __func__);
return -EBUSY;
}
dev->fmt = get_format(f);
dev->pixelsize = dev->fmt->depth / 8;
dev->width = f->fmt.pix.width;
dev->height = f->fmt.pix.height;
return 0;
}
static int vidioc_enum_framesizes(struct file *file, void *fh,
struct v4l2_frmsizeenum *fsize)
{
static const struct v4l2_frmsize_stepwise sizes = {
48, MAX_WIDTH, 4,
32, MAX_HEIGHT, 1
};
int i;
if (fsize->index)
return -EINVAL;
for (i = 0; i < ARRAY_SIZE(formats); i++)
if (formats[i].fourcc == fsize->pixel_format)
break;
if (i == ARRAY_SIZE(formats))
return -EINVAL;
fsize->type = V4L2_FRMSIZE_TYPE_STEPWISE;
fsize->stepwise = sizes;
return 0;
}
/* only one input in this sample driver */
static int vidioc_enum_input(struct file *file, void *priv,
struct v4l2_input *inp)
{
if (inp->index >= NUM_INPUTS)
return -EINVAL;
inp->type = V4L2_INPUT_TYPE_CAMERA;
sprintf(inp->name, "Camera %u", inp->index);
return 0;
}
static int vidioc_g_input(struct file *file, void *priv, unsigned int *i)
{
struct vivi_dev *dev = video_drvdata(file);
*i = dev->input;
return 0;
}
static int vidioc_s_input(struct file *file, void *priv, unsigned int i)
{
struct vivi_dev *dev = video_drvdata(file);
if (i >= NUM_INPUTS)
return -EINVAL;
if (i == dev->input)
return 0;
dev->input = i;
/*
* Modify the brightness range depending on the input.
* This makes it easy to use vivi to test if applications can
* handle control range modifications and is also how this is
* typically used in practice as different inputs may be hooked
* up to different receivers with different control ranges.
*/
v4l2_ctrl_modify_range(dev->brightness,
128 * i, 255 + 128 * i, 1, 127 + 128 * i);
precalculate_bars(dev);
precalculate_line(dev);
return 0;
}
/* timeperframe is arbitrary and continuous */
static int vidioc_enum_frameintervals(struct file *file, void *priv,
struct v4l2_frmivalenum *fival)
{
const struct vivi_fmt *fmt;
if (fival->index)
return -EINVAL;
fmt = __get_format(fival->pixel_format);
if (!fmt)
return -EINVAL;
/* check for valid width/height */
if (fival->width < 48 || fival->width > MAX_WIDTH || (fival->width & 3))
return -EINVAL;
if (fival->height < 32 || fival->height > MAX_HEIGHT)
return -EINVAL;
fival->type = V4L2_FRMIVAL_TYPE_CONTINUOUS;
/* fill in stepwise (step=1.0 is required by V4L2 spec) */
fival->stepwise.min = tpf_min;
fival->stepwise.max = tpf_max;
fival->stepwise.step = (struct v4l2_fract) {1, 1};
return 0;
}
static int vidioc_g_parm(struct file *file, void *priv,
struct v4l2_streamparm *parm)
{
struct vivi_dev *dev = video_drvdata(file);
if (parm->type != V4L2_BUF_TYPE_VIDEO_CAPTURE)
return -EINVAL;
parm->parm.capture.capability = V4L2_CAP_TIMEPERFRAME;
parm->parm.capture.timeperframe = dev->timeperframe;
parm->parm.capture.readbuffers = 1;
return 0;
}
#define FRACT_CMP(a, OP, b) \
((u64)(a).numerator * (b).denominator OP (u64)(b).numerator * (a).denominator)
static int vidioc_s_parm(struct file *file, void *priv,
struct v4l2_streamparm *parm)
{
struct vivi_dev *dev = video_drvdata(file);
struct v4l2_fract tpf;
if (parm->type != V4L2_BUF_TYPE_VIDEO_CAPTURE)
return -EINVAL;
tpf = parm->parm.capture.timeperframe;
/* tpf: {*, 0} resets timing; clip to [min, max]*/
tpf = tpf.denominator ? tpf : tpf_default;
tpf = FRACT_CMP(tpf, <, tpf_min) ? tpf_min : tpf;
tpf = FRACT_CMP(tpf, >, tpf_max) ? tpf_max : tpf;
dev->timeperframe = tpf;
parm->parm.capture.timeperframe = tpf;
parm->parm.capture.readbuffers = 1;
return 0;
}
/* --- controls ---------------------------------------------- */
static int vivi_g_volatile_ctrl(struct v4l2_ctrl *ctrl)
{
struct vivi_dev *dev = container_of(ctrl->handler, struct vivi_dev, ctrl_handler);
if (ctrl == dev->autogain)
dev->gain->val = jiffies & 0xff;
return 0;
}
static int vivi_s_ctrl(struct v4l2_ctrl *ctrl)
{
struct vivi_dev *dev = container_of(ctrl->handler, struct vivi_dev, ctrl_handler);
switch (ctrl->id) {
case V4L2_CID_ALPHA_COMPONENT:
dev->alpha_component = ctrl->val;
break;
default:
if (ctrl == dev->button)
dev->button_pressed = 30;
break;
}
return 0;
}
/* ------------------------------------------------------------------
File operations for the device
------------------------------------------------------------------*/
static const struct v4l2_ctrl_ops vivi_ctrl_ops = {
.g_volatile_ctrl = vivi_g_volatile_ctrl,
.s_ctrl = vivi_s_ctrl,
};
#define VIVI_CID_CUSTOM_BASE (V4L2_CID_USER_BASE | 0xf000)
static const struct v4l2_ctrl_config vivi_ctrl_button = {
.ops = &vivi_ctrl_ops,
.id = VIVI_CID_CUSTOM_BASE + 0,
.name = "Button",
.type = V4L2_CTRL_TYPE_BUTTON,
};
static const struct v4l2_ctrl_config vivi_ctrl_boolean = {
.ops = &vivi_ctrl_ops,
.id = VIVI_CID_CUSTOM_BASE + 1,
.name = "Boolean",
.type = V4L2_CTRL_TYPE_BOOLEAN,
.min = 0,
.max = 1,
.step = 1,
.def = 1,
};
static const struct v4l2_ctrl_config vivi_ctrl_int32 = {
.ops = &vivi_ctrl_ops,
.id = VIVI_CID_CUSTOM_BASE + 2,
.name = "Integer 32 Bits",
.type = V4L2_CTRL_TYPE_INTEGER,
.min = 0x80000000,
.max = 0x7fffffff,
.step = 1,
};
static const struct v4l2_ctrl_config vivi_ctrl_int64 = {
.ops = &vivi_ctrl_ops,
.id = VIVI_CID_CUSTOM_BASE + 3,
.name = "Integer 64 Bits",
.type = V4L2_CTRL_TYPE_INTEGER64,
};
static const char * const vivi_ctrl_menu_strings[] = {
"Menu Item 0 (Skipped)",
"Menu Item 1",
"Menu Item 2 (Skipped)",
"Menu Item 3",
"Menu Item 4",
"Menu Item 5 (Skipped)",
NULL,
};
static const struct v4l2_ctrl_config vivi_ctrl_menu = {
.ops = &vivi_ctrl_ops,
.id = VIVI_CID_CUSTOM_BASE + 4,
.name = "Menu",
.type = V4L2_CTRL_TYPE_MENU,
.min = 1,
.max = 4,
.def = 3,
.menu_skip_mask = 0x04,
.qmenu = vivi_ctrl_menu_strings,
};
static const struct v4l2_ctrl_config vivi_ctrl_string = {
.ops = &vivi_ctrl_ops,
.id = VIVI_CID_CUSTOM_BASE + 5,
.name = "String",
.type = V4L2_CTRL_TYPE_STRING,
.min = 2,
.max = 4,
.step = 1,
};
static const struct v4l2_ctrl_config vivi_ctrl_bitmask = {
.ops = &vivi_ctrl_ops,
.id = VIVI_CID_CUSTOM_BASE + 6,
.name = "Bitmask",
.type = V4L2_CTRL_TYPE_BITMASK,
.def = 0x80002000,
.min = 0,
.max = 0x80402010,
.step = 0,
};
static const s64 vivi_ctrl_int_menu_values[] = {
1, 1, 2, 3, 5, 8, 13, 21, 42,
};
static const struct v4l2_ctrl_config vivi_ctrl_int_menu = {
.ops = &vivi_ctrl_ops,
.id = VIVI_CID_CUSTOM_BASE + 7,
.name = "Integer menu",
.type = V4L2_CTRL_TYPE_INTEGER_MENU,
.min = 1,
.max = 8,
.def = 4,
.menu_skip_mask = 0x02,
.qmenu_int = vivi_ctrl_int_menu_values,
};
static const struct v4l2_file_operations vivi_fops = {
.owner = THIS_MODULE,
.open = v4l2_fh_open,
.release = vb2_fop_release,
.read = vb2_fop_read,
.poll = vb2_fop_poll,
.unlocked_ioctl = video_ioctl2, /* V4L2 ioctl handler */
.mmap = vb2_fop_mmap,
};
static const struct v4l2_ioctl_ops vivi_ioctl_ops = {
.vidioc_querycap = vidioc_querycap,
.vidioc_enum_fmt_vid_cap = vidioc_enum_fmt_vid_cap,
.vidioc_g_fmt_vid_cap = vidioc_g_fmt_vid_cap,
.vidioc_try_fmt_vid_cap = vidioc_try_fmt_vid_cap,
.vidioc_s_fmt_vid_cap = vidioc_s_fmt_vid_cap,
.vidioc_enum_framesizes = vidioc_enum_framesizes,
.vidioc_reqbufs = vb2_ioctl_reqbufs,
.vidioc_create_bufs = vb2_ioctl_create_bufs,
.vidioc_prepare_buf = vb2_ioctl_prepare_buf,
.vidioc_querybuf = vb2_ioctl_querybuf,
.vidioc_qbuf = vb2_ioctl_qbuf,
.vidioc_dqbuf = vb2_ioctl_dqbuf,
.vidioc_enum_input = vidioc_enum_input,
.vidioc_g_input = vidioc_g_input,
.vidioc_s_input = vidioc_s_input,
.vidioc_enum_frameintervals = vidioc_enum_frameintervals,
.vidioc_g_parm = vidioc_g_parm,
.vidioc_s_parm = vidioc_s_parm,
.vidioc_streamon = vb2_ioctl_streamon,
.vidioc_streamoff = vb2_ioctl_streamoff,
.vidioc_log_status = v4l2_ctrl_log_status,
.vidioc_subscribe_event = v4l2_ctrl_subscribe_event,
.vidioc_unsubscribe_event = v4l2_event_unsubscribe,
};
static const struct video_device vivi_template = {
.name = "vivi",
.fops = &vivi_fops,
.ioctl_ops = &vivi_ioctl_ops,
.release = video_device_release_empty,
};
/* -----------------------------------------------------------------
Initialization and module stuff
------------------------------------------------------------------*/
static int vivi_release(void)
{
struct vivi_dev *dev;
struct list_head *list;
while (!list_empty(&vivi_devlist)) {
list = vivi_devlist.next;
list_del(list);
dev = list_entry(list, struct vivi_dev, vivi_devlist);
v4l2_info(&dev->v4l2_dev, "unregistering %s\n",
video_device_node_name(&dev->vdev));
video_unregister_device(&dev->vdev);
v4l2_device_unregister(&dev->v4l2_dev);
v4l2_ctrl_handler_free(&dev->ctrl_handler);
kfree(dev);
}
return 0;
}
static int __init vivi_create_instance(int inst)
{
struct vivi_dev *dev;
struct video_device *vfd;
struct v4l2_ctrl_handler *hdl;
struct vb2_queue *q;
int ret;
dev = kzalloc(sizeof(*dev), GFP_KERNEL);
if (!dev)
return -ENOMEM;
snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name),
"%s-%03d", VIVI_MODULE_NAME, inst);
ret = v4l2_device_register(NULL, &dev->v4l2_dev);
if (ret)
goto free_dev;
dev->fmt = &formats[0];
dev->timeperframe = tpf_default;
dev->width = 640;
dev->height = 480;
dev->pixelsize = dev->fmt->depth / 8;
hdl = &dev->ctrl_handler;
v4l2_ctrl_handler_init(hdl, 11);
dev->volume = v4l2_ctrl_new_std(hdl, &vivi_ctrl_ops,
V4L2_CID_AUDIO_VOLUME, 0, 255, 1, 200);
dev->brightness = v4l2_ctrl_new_std(hdl, &vivi_ctrl_ops,
V4L2_CID_BRIGHTNESS, 0, 255, 1, 127);
dev->contrast = v4l2_ctrl_new_std(hdl, &vivi_ctrl_ops,
V4L2_CID_CONTRAST, 0, 255, 1, 16);
dev->saturation = v4l2_ctrl_new_std(hdl, &vivi_ctrl_ops,
V4L2_CID_SATURATION, 0, 255, 1, 127);
dev->hue = v4l2_ctrl_new_std(hdl, &vivi_ctrl_ops,
V4L2_CID_HUE, -128, 127, 1, 0);
dev->autogain = v4l2_ctrl_new_std(hdl, &vivi_ctrl_ops,
V4L2_CID_AUTOGAIN, 0, 1, 1, 1);
dev->gain = v4l2_ctrl_new_std(hdl, &vivi_ctrl_ops,
V4L2_CID_GAIN, 0, 255, 1, 100);
dev->alpha = v4l2_ctrl_new_std(hdl, &vivi_ctrl_ops,
V4L2_CID_ALPHA_COMPONENT, 0, 255, 1, 0);
dev->button = v4l2_ctrl_new_custom(hdl, &vivi_ctrl_button, NULL);
dev->int32 = v4l2_ctrl_new_custom(hdl, &vivi_ctrl_int32, NULL);
dev->int64 = v4l2_ctrl_new_custom(hdl, &vivi_ctrl_int64, NULL);
dev->boolean = v4l2_ctrl_new_custom(hdl, &vivi_ctrl_boolean, NULL);
dev->menu = v4l2_ctrl_new_custom(hdl, &vivi_ctrl_menu, NULL);
dev->string = v4l2_ctrl_new_custom(hdl, &vivi_ctrl_string, NULL);
dev->bitmask = v4l2_ctrl_new_custom(hdl, &vivi_ctrl_bitmask, NULL);
dev->int_menu = v4l2_ctrl_new_custom(hdl, &vivi_ctrl_int_menu, NULL);
if (hdl->error) {
ret = hdl->error;
goto unreg_dev;
}
v4l2_ctrl_auto_cluster(2, &dev->autogain, 0, true);
dev->v4l2_dev.ctrl_handler = hdl;
/* initialize locks */
spin_lock_init(&dev->slock);
/* initialize queue */
q = &dev->vb_vidq;
q->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
q->io_modes = VB2_MMAP | VB2_USERPTR | VB2_DMABUF | VB2_READ;
q->drv_priv = dev;
q->buf_struct_size = sizeof(struct vivi_buffer);
q->ops = &vivi_video_qops;
q->mem_ops = &vb2_vmalloc_memops;
q->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
ret = vb2_queue_init(q);
if (ret)
goto unreg_dev;
mutex_init(&dev->mutex);
/* init video dma queues */
INIT_LIST_HEAD(&dev->vidq.active);
init_waitqueue_head(&dev->vidq.wq);
vfd = &dev->vdev;
*vfd = vivi_template;
vfd->debug = debug;
vfd->v4l2_dev = &dev->v4l2_dev;
vfd->queue = q;
set_bit(V4L2_FL_USE_FH_PRIO, &vfd->flags);
/*
* Provide a mutex to v4l2 core. It will be used to protect
* all fops and v4l2 ioctls.
*/
vfd->lock = &dev->mutex;
video_set_drvdata(vfd, dev);
ret = video_register_device(vfd, VFL_TYPE_GRABBER, video_nr);
if (ret < 0)
goto unreg_dev;
/* Now that everything is fine, let's add it to device list */
list_add_tail(&dev->vivi_devlist, &vivi_devlist);
v4l2_info(&dev->v4l2_dev, "V4L2 device registered as %s\n",
video_device_node_name(vfd));
return 0;
unreg_dev:
v4l2_ctrl_handler_free(hdl);
v4l2_device_unregister(&dev->v4l2_dev);
free_dev:
kfree(dev);
return ret;
}
/* This routine allocates from 1 to n_devs virtual drivers.
The real maximum number of virtual drivers will depend on how many drivers
will succeed. This is limited to the maximum number of devices that
videodev supports, which is equal to VIDEO_NUM_DEVICES.
*/
static int __init vivi_init(void)
{
const struct font_desc *font = find_font("VGA8x16");
int ret = 0, i;
if (font == NULL) {
printk(KERN_ERR "vivi: could not find font\n");
return -ENODEV;
}
font8x16 = font->data;
if (n_devs <= 0)
n_devs = 1;
for (i = 0; i < n_devs; i++) {
ret = vivi_create_instance(i);
if (ret) {
/* If some instantiations succeeded, keep driver */
if (i)
ret = 0;
break;
}
}
if (ret < 0) {
printk(KERN_ERR "vivi: error %d while loading driver\n", ret);
return ret;
}
printk(KERN_INFO "Video Technology Magazine Virtual Video "
"Capture Board ver %s successfully loaded.\n",
VIVI_VERSION);
/* n_devs will reflect the actual number of allocated devices */
n_devs = i;
return ret;
}
static void __exit vivi_exit(void)
{
vivi_release();
}
module_init(vivi_init);
module_exit(vivi_exit);