
Splitting rows into bins - ntile vs width_bucket

Original · fizz · 2022-11-19
  • Statement1

    NTILE sorts the data according to its ORDER BY clause, then splits it into N groups. Each group contains the same number of rows. If ( row count / N ) is not an integer, some groups contain one extra row; see the sketch after this example.

    Split into quartiles with NTILE

    with rws as (  
      select dbms_random.value ( 0, 100 ) x  
      from   dual connect by level <= 1000  
    ), grps as (  
      select x,   
             ntile ( 4 ) over ( order by x ) grp  
      from   rws  
    )  
      select grp, count (*) row#,   
             round ( min ( x ), 1 ) lower_bound,   
             round ( max ( x ), 1 ) upper_bound  
      from   grps  
      group  by grp order by grp
    GRP   ROW#   LOWER_BOUND   UPPER_BOUND
      1    250            .3          24.2
      2    250          24.3          50.1
      3    250          50.1          75.8
      4    250          75.9          99.9

    4 rows selected.
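    Oracle distributes the remainder one row at a time starting with group 1, so the lower-numbered groups get the extra rows. A minimal sketch (not part of the original article): splitting ten rows into four groups gives counts of 3, 3, 2, 2.

    with rws as (
      select level x from dual connect by level <= 10
    ), grps as (
      select x, ntile ( 4 ) over ( order by x ) grp
      from   rws
    )
      select grp, count (*) row#
      from   grps
      group  by grp order by grp
      -- groups 1 and 2 get 3 rows; groups 3 and 4 get 2 rows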
  • Statement2

    As with the previous example, each group has 250 rows. The upper and lower bounds for each group are significantly different, though.

    Split into quartiles with NTILE - normal distribution

    with rws as (  
      select ( 10 * dbms_random.normal ) + 50 x  
      from   dual connect by level <= 1000  
    ), grps as (  
      select x,   
             ntile ( 4 ) over ( order by x ) grp  
      from   rws  
    )  
      select grp, count(*),   
             round ( min ( x ), 1 ),   
             round ( max ( x ), 1 )  
      from   grps  
      group  by grp order  by grp
    GRP   COUNT(*)   ROUND(MIN(X),1)   ROUND(MAX(X),1)
      1        250              23.2              43.4
      2        250              43.5              50.2
      3        250              50.2              56.7
      4        250              56.8                85

    4 rows selected.
  • Statement3

    WIDTH_BUCKET creates buckets of equal width. In this case each covers 25 values, which fixes the range of values in each bucket. The numbers are randomly generated using a uniform distribution, so there are roughly 250 rows in each group. (WIDTH_BUCKET also defines overflow buckets for out-of-range values; see the sketch after this example.)

    Split into four bins with WIDTH_BUCKET

    with rws as (  
      select dbms_random.value ( 0, 100 ) x  
      from   dual connect by level <= 1000  
    ), grps as (  
      select x,   
             -- intervals [0, 25), [25, 50), [50, 75), [75, 100)  
             width_bucket ( x, 0, 100, 4 ) bucket  
      from   rws  
    )  
      select bucket, count(*),   
             round ( min ( x ), 1 ),   
             round ( max ( x ), 1 )  
      from   grps  
      group  by bucket order by bucket
    BUCKET   COUNT(*)   ROUND(MIN(X),1)   ROUND(MAX(X),1)
         1        249                 0              24.9
         2        251                25                50
         3        234              50.5              74.8
         4        266                75               100

    4 rows selected.
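    WIDTH_BUCKET also handles out-of-range input: values below the lower bound return bucket 0, and values at or above the upper bound return bucket N + 1 (the upper bound is exclusive). A minimal sketch:

    select width_bucket ( -1, 0, 100, 4 ) below_range,     -- 0: under the range
           width_bucket ( 100, 0, 100, 4 ) at_upper_bound  -- 5: upper bound is exclusive
    from   dual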
  • Statement4

    As with the previous example, the upper and lower bounds of each bin are fixed. But this time the data are generated using a normal distribution, so almost all the rows fall in the middle two buckets.

    Split into four bins with WIDTH_BUCKET - normal distribution

    with rws as (  
      select ( 10 * dbms_random.normal ) + 50 x  
      from   dual connect by level <= 1000  
    ), grps as (  
      select x,   
             width_bucket ( x, 0, 100, 4 ) bucket  
      from   rws  
    )  
      select bucket, count(*),   
             round ( min ( x ), 1 ),   
             round ( max ( x ), 1 )  
      from   grps  
      group  by bucket order by bucket
    BUCKET   COUNT(*)   ROUND(MIN(X),1)   ROUND(MAX(X),1)
         1          3               9.8              21.5
         2        490              25.6                50
         3        502                50              74.4
         4          5              75.3                86

    4 rows selected.
  • Statement5

    The previous WIDTH_BUCKET examples fixed the range to [0, 100). This sets the upper and lower bounds dynamically using the MAX and MIN analytic functions. Adding 1 to the maximum ensures the largest value falls in bucket 4 rather than the overflow bucket 5, because WIDTH_BUCKET excludes the upper bound (see the sketch after this example).

    Split into four bins with WIDTH_BUCKET with dynamic bounds

    with rws as (  
      select ( 10 * dbms_random.normal ) + 50 x  
      from   dual connect by level <= 1000  
    ), grps as (  
      select x, width_bucket (   
               x,   
               min ( x ) over (),   
               max ( x ) over () + 1,   
               4   
             ) bucket  
      from   rws  
    )  
      select bucket, count(*),   
             round ( min ( x ), 1 ),   
             round ( max ( x ), 1 ),  
             round ( max ( x ) - min ( x ), 1 ) bucket_range  
      from   grps  
      group  by bucket order by bucket
    BUCKET   COUNT(*)   ROUND(MIN(X),1)   ROUND(MAX(X),1)   BUCKET_RANGE
         1         79                21              36.5           15.6
         2        504              36.8              52.2           15.4
         3        397              52.2              67.7           15.5
         4         20              68.2              82.4           14.3

    4 rows selected.
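    A minimal sketch of why the + 1 is needed: WIDTH_BUCKET excludes the upper bound, so without the adjustment the row holding the maximum value lands in the overflow bucket.

    select width_bucket ( 10, 0, 10, 4 ) unadjusted,    -- 5: the max overflows
           width_bucket ( 10, 0, 10 + 1, 4 ) adjusted   -- 4: the max stays in the last bucket
    from   dual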
  • Statement6

    WIDTH_BUCKET only supports numeric and datetime values. So if you want to split strings into buckets, you first need to map the characters to numbers. This uses DENSE_RANK to map each starting letter to a value from 1 to 26.

    Split into four bins with WIDTH_BUCKET - character data

    with rws as (  
      select initcap ( dbms_random.string ( 'u', 10 ) ) x  
      from   dual connect by level <= 1000   
    ), ranks as (  
      select x, substr ( x, 1, 1 ) letter,  
             dense_rank () over ( order by substr ( x, 1, 1 ) ) dr  
      from   rws  
    ), grps as (   
      select x, letter, width_bucket (   
               dr, 1, 27, 4   
             ) grp  
      from   ranks  
    )  
      select min ( letter ), max ( letter ),  
             count ( distinct letter ) value#,  
             count (*) row#,  
             min ( x ), max ( x )   
      from   grps  
      group  by grp order by grp
    MIN(LETTER)   MAX(LETTER)   VALUE#   ROW#   MIN(X)       MAX(X)
    A             G                  7    264   Abbkaisqob   Gzthagisaj
    H             M                  6    249   Haotkacnux   Mzqrivbrqz
    N             T                  7    276   Nauugxwzqt   Tzjwgcpztq
    U             Z                  6    211   Udnwtrnsgn   Zxxyobitlq

    4 rows selected.
  • Statement7

    This uses DENSE_RANK to assign each JOB_ID a number in the range [1, number of distinct jobs]. This enables you to place the rows for each job in alphabetical buckets.

    Split into four bins with WIDTH_BUCKET - complete strings

    with rws as (  
      select dense_rank () over ( order by job_id ) dr, e.*  
      from   hr.employees e  
    ), grps as (  
      select r.*,   
             width_bucket ( dr, 1, max ( dr ) over () + 1, 4 ) bucket  
      from   rws r  
    )  
      select bucket,  
             count ( distinct job_id ) job_count,  
             count(*) row_count,   
             listagg ( distinct job_id, ',' )    
               within group ( order by job_id ) jobs  
      from   grps  
      group  by bucket
    BUCKET   JOB_COUNT   ROW_COUNT   JOBS
         1           5           6   AC_ACCOUNT,AC_MGR,AD_ASST,AD_PRES,AD_VP
         2           5          13   FI_ACCOUNT,FI_MGR,HR_REP,IT_PROG,MK_MAN
         3           5          13   MK_REP,PR_REP,PU_CLERK,PU_MAN,SA_MAN
         4           4          75   SA_REP,SH_CLERK,ST_CLERK,ST_MAN

    4 rows selected.
  • Statement8

    Using DENSE_RANK to map letters to numbers can give unexpected results if some letters are missing from the source. In this case gaps mean W is mapped to 20 instead of its actual position in the English alphabet, 23. So names starting with W go in the third bin instead of the last, and the query returns only three rows instead of the four buckets requested. The diagnostic sketch after this example shows the gaps.

    Split into four bins with WIDTH_BUCKET - missing letters

    with rws as (  
      select first_name x  
      from   hr.employees  
    ), ranks as (  
      select x, substr ( x, 1, 1 ) letter,  
             dense_rank () over ( order by substr ( x, 1, 1 ) ) dr  
      from   rws  
    ), grps as (   
      select x, letter, width_bucket (   
               dr, 1, 27, 4   
             ) grp  
      from   ranks  
    )  
      select min ( letter ), max ( letter ),  
             count ( distinct letter ) value#,  
             count (*) row#,  
             min ( x ), max ( x )   
      from   grps  
      group  by grp order by grp
    MIN(LETTER)   MAX(LETTER)   VALUE#   ROW#   MIN(X)   MAX(X)
    A             H                  7     34   Adam     Hermann
    I             N                  6     41   Irene    Neena
    O             W                  7     32   Oliver   Winston

    3 rows selected.
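    A quick diagnostic sketch (not in the original) to expose the gaps: list each distinct starting letter with its DENSE_RANK and its true alphabet position. Rows where the two numbers differ mark letters that shifted down because earlier letters are missing.

    with ranks as (
      select distinct
             substr ( first_name, 1, 1 ) letter,
             dense_rank () over ( order by substr ( first_name, 1, 1 ) ) dr
      from   hr.employees
    )
      select letter, dr,
             ascii ( letter ) - 64 alphabet_position  -- e.g. W -> dr 20, position 23
      from   ranks
      order  by letter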
  • Statement9

    You can solve the problem of missing letters by mapping them to ASCII character codes instead; a quick check of the arithmetic follows the example.

    Split letters into four groups with WIDTH_BUCKET

    with rws as ( 
      select first_name x 
      from   hr.employees 
    ), ranks as ( 
      select x, substr ( x, 1, 1 ) letter, 
             ascii ( substr ( x, 1, 1 ) ) - 64 dr 
      from   rws 
    ), grps as (  
      select x, letter, width_bucket (  
               dr, 1, 27, 4  
             ) grp 
      from   ranks 
    ) 
      select min ( letter ), max ( letter ), 
             count ( distinct letter ) value#, 
             count (*) row#, 
             min ( x ), max ( x )  
      from   grps 
      group  by grp order by grp
    MIN(LETTER)   MAX(LETTER)   VALUE#   ROW#   MIN(X)     MAX(X)
    A             G                  6     31   Adam       Guy
    H             M                  6     40   Harrison   Mozhe
    N             T                  6     31   Nancy      Trenna
    V             W                  2      5   Valli      Winston

    4 rows selected.
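    A quick check of the arithmetic: the ASCII codes for A-Z run from 65 to 90, so subtracting 64 gives every letter its fixed alphabet position, whether or not all letters appear in the data.

    select ascii ( 'A' ) - 64 a_pos,  -- 1
           ascii ( 'W' ) - 64 w_pos,  -- 23: W keeps its true position
           ascii ( 'Z' ) - 64 z_pos   -- 26
    from   dual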
  • Statement10

    NTILE and WIDTH_BUCKET work with a fixed number of bins. If you instead need an unknown number of groups of a fixed size, you can use this method. It generates a unique number for each row, from 1 to the number of rows, then groups them into batches of five by taking the ceiling of the row number divided by five. A worked example of the formula follows the output.

    Split into groups with five rows each

    with rws as (  
      select dbms_random.value ( 0, 100 ) x, level id, 5 n  
      from   dual  
      connect by level <= dbms_random.value ( 100, 1000 )  
    ), grps as (  
      select x, n,  
             row_number () over ( order by id ) rn  
      from   rws  
    )  
      select ceil ( rn / n ), count(*),  
             round ( min ( x ) ), round ( max ( x ) )  
      from   grps  
      group  by ceil ( rn / n ) order by ceil ( rn / n )
    CEIL(RN/N)   COUNT(*)   ROUND(MIN(X))   ROUND(MAX(X))
             1          5              13              75
             2          5              29              88
             3          5               5              85
             4          5              25              84
             5          5              20              93
             6          5               7              87
             7          5               4              91
             8          5               0              58
             9          5              21              93
            10          5              34             100
            11          5               4              96
            12          5               1              92
            13          5              16              62
            14          5               1              69
            15          5               9              86
            16          5              18              81
            17          5              32              86
            18          5              30              83
            19          5              47              90
            20          5               9              82
            21          4               5              92

    21 rows selected.
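    A worked example of the ceiling formula: rows 1-5 give CEIL ( rn / 5 ) = 1, rows 6-10 give 2, and so on, so every group covers exactly five consecutive row numbers (except possibly the last).

    select level rn, ceil ( level / 5 ) grp
    from   dual connect by level <= 12
    -- rn 1-5 -> grp 1; rn 6-10 -> grp 2; rn 11-12 -> grp 3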
  • Statement11

    The previous example puts five rows in each group. This instead fixes the group size to five values, so the intervals are [0, 5), [5, 10), [10, 15), etc. The principle is the same: normalize the values by dividing them by the group size. This takes the floor instead of the ceiling so the first group starts at zero instead of one. Multiplying back up by the interval size gives the lower bound for each group; adding the interval size to that gives the upper bound. A worked example follows the output.

    Split rows into fixed intervals

    with rws as (  
      select dbms_random.value ( 0, 100 ) x,  
             5 interval_size  
      from   dual connect by level <= 1000  
    ), grps as (  
      select floor ( x / interval_size ) * interval_size x,  
             interval_size  
      from   rws  
    )  
      select x, x + interval_size,  
             count(*)  
      from   grps  
      group  by x, interval_size  
      order  by x
    X    X+INTERVAL_SIZE   COUNT(*)
    0                  5         60
    5                 10         56
    10                15         49
    15                20         42
    20                25         58
    25                30         47
    30                35         64
    35                40         48
    40                45         44
    45                50         50
    50                55         42
    55                60         58
    60                65         49
    65                70         59
    70                75         48
    75                80         50
    80                85         49
    85                90         34
    90                95         48
    95               100         45

    20 rows selected.
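    A worked example of the normalization: for x = 12.3, FLOOR ( 12.3 / 5 ) * 5 = 10, so the value falls in the interval [10, 15).

    select floor ( 12.3 / 5 ) * 5 lower_bound,      -- 10
           floor ( 12.3 / 5 ) * 5 + 5 upper_bound   -- 15
    from   dual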
  • Statement12

    alter session set nls_date_format = 'DD-MON HH24:MI'

    Statement processed.

  • Statement13

    alter session set nls_timestamp_format = 'DD-MON HH24:MI'

    Statement processed.

  • Statement14

    This uses the same concept as the previous two examples to split the rows into groups of five minutes. It does this by finding the number of minutes past midnight for each row, then applies the normalizing formula to turn these into five-minute groups. A worked example follows the output.

    Split into five minute intervals

    with rws as (  
      select timestamp'2021-09-01 00:00:00' +   
               numtodsinterval ( dbms_random.value ( 0, 120 ), 'minute' ) x,  
             5 interval_mins  
      from   dual connect by level <= 1000  
    ), grps as (  
      select x, interval_mins, trunc ( x ) dt,  
             floor (   
                -- convert to minutes past midnight  
                 ( extract ( hour from x ) * 60 ) + extract ( minute from x )   
                  / interval_mins   
             ) * interval_mins mins  
      from   rws  
    )  
      select dt + mins/1440 st, dt + ( mins + interval_mins ) / 1440 en,   
             count (*)  
      from   grps  
      group  by dt + mins/1440, dt + ( mins + interval_mins ) / 1440 order by dt + mins/1440
    ST             EN             COUNT(*)
    01-SEP 00:00   01-SEP 00:05         71
    01-SEP 00:05   01-SEP 00:10         67
    01-SEP 00:10   01-SEP 00:15         55
    01-SEP 00:15   01-SEP 00:20         48
    01-SEP 00:20   01-SEP 00:25         65
    01-SEP 00:25   01-SEP 00:30         58
    01-SEP 00:30   01-SEP 00:35         79
    01-SEP 00:35   01-SEP 00:40         51
    01-SEP 00:40   01-SEP 00:45         64
    01-SEP 00:45   01-SEP 00:50         67
    01-SEP 00:50   01-SEP 00:55         63
    01-SEP 00:55   01-SEP 01:00         60
    01-SEP 01:00   01-SEP 01:05         24
    01-SEP 01:05   01-SEP 01:10         20
    01-SEP 01:10   01-SEP 01:15         28
    01-SEP 01:15   01-SEP 01:20         17
    01-SEP 01:20   01-SEP 01:25         30
    01-SEP 01:25   01-SEP 01:30         14
    01-SEP 01:30   01-SEP 01:35         22
    01-SEP 01:35   01-SEP 01:40         15
    01-SEP 01:40   01-SEP 01:45         29
    01-SEP 01:45   01-SEP 01:50         21
    01-SEP 01:50   01-SEP 01:55         17
    01-SEP 01:55   01-SEP 02:00         15

    24 rows selected.
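    A worked example of the conversion: a row at 01:23 is ( 1 * 60 ) + 23 = 83 minutes past midnight, and FLOOR ( 83 / 5 ) * 5 = 80, placing it in the interval [01:20, 01:25).

    select floor ( ( ( 1 * 60 ) + 23 ) / 5 ) * 5 mins  -- 80 => interval [01:20, 01:25)
    from   dual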
  • Statement15

    This is the same as the previous example, but some time intervals contain no rows, so they are missing from the result set.

    Split into five minute intervals - sparse data

    with rws as (  
      select timestamp'2021-09-01 00:00:00' +   
               numtodsinterval ( dbms_random.value ( 0, 120 ), 'minute' ) x,  
             5 interval_mins  
      from   dual connect by level <= 10  
    ), grps as (  
      select x, interval_mins, trunc ( x ) dt,  
             floor (   
                -- convert to minutes past midnight  
                 ( extract ( hour from x ) * 60 ) + extract ( minute from x )   
                  / interval_mins   
             ) * interval_mins mins  
      from   rws  
    )  
      select dt + mins/1440 st, dt + ( mins + interval_mins ) / 1440 en,   
             count (*)  
      from   grps  
      group  by dt + mins/1440, dt + ( mins + interval_mins ) / 1440 order by dt + mins/1440
    ST             EN             COUNT(*)
    01-SEP 00:10   01-SEP 00:15          2
    01-SEP 00:15   01-SEP 00:20          1
    01-SEP 00:35   01-SEP 00:40          2
    01-SEP 00:40   01-SEP 00:45          2
    01-SEP 00:45   01-SEP 00:50          1
    01-SEP 00:50   01-SEP 00:55          1
    01-SEP 00:55   01-SEP 01:00          1

    7 rows selected.
  • Statement16

    As with the previous example, this has sparse data. To ensure the output includes a row for every interval in the period, it first generates all the intervals, then outer joins the sparse data to them.

    Split into five minute intervals - sparse data

    with intervals as (  
      select 5 interval_size,   
             timestamp'2021-09-01 00:00:00' lower_bound,   
             120 total_minutes   
      from   dual  
    ), grps as (  
      select lower_bound + numtodsinterval ( ( level - 1 ) * interval_size, 'minute' ) mn_x,  
             lower_bound + numtodsinterval ( level * interval_size, 'minute' ) mx_x,  
             interval_size  
      from   intervals  
      connect by level <= (   
        -- unknown data size; create a range for all possible values  
        select max ( ceil ( total_minutes / interval_size ) )   
        from   intervals  
      )  
    ), rws as (  
      select lower_bound +   
               numtodsinterval ( dbms_random.value ( 0, 120 ), 'minute' ) x  
      from   intervals connect by level <= 25  
    )  
      select mn_x, mx_x,   
             count ( x ) rws_per_group   
      from   grps  
      left join rws  
      on     x >= mn_x  
      and    x < mx_x  
      group  by mn_x, mx_x order by mn_x
    MN_X           MX_X           RWS_PER_GROUP
    01-SEP 00:00   01-SEP 00:05               1
    01-SEP 00:05   01-SEP 00:10               1
    01-SEP 00:10   01-SEP 00:15               2
    01-SEP 00:15   01-SEP 00:20               1
    01-SEP 00:20   01-SEP 00:25               0
    01-SEP 00:25   01-SEP 00:30               0
    01-SEP 00:30   01-SEP 00:35               0
    01-SEP 00:35   01-SEP 00:40               3
    01-SEP 00:40   01-SEP 00:45               1
    01-SEP 00:45   01-SEP 00:50               1
    01-SEP 00:50   01-SEP 00:55               3
    01-SEP 00:55   01-SEP 01:00               0
    01-SEP 01:00   01-SEP 01:05               0
    01-SEP 01:05   01-SEP 01:10               0
    01-SEP 01:10   01-SEP 01:15               0
    01-SEP 01:15   01-SEP 01:20               1
    01-SEP 01:20   01-SEP 01:25               1
    01-SEP 01:25   01-SEP 01:30               3
    01-SEP 01:30   01-SEP 01:35               0
    01-SEP 01:35   01-SEP 01:40               0
    01-SEP 01:40   01-SEP 01:45               1
    01-SEP 01:45   01-SEP 01:50               0
    01-SEP 01:50   01-SEP 01:55               1
    01-SEP 01:55   01-SEP 02:00               2

    24 rows selected.
  • Statement17

    You can use the previous method for sparse numeric data as well as datetime data. This generates the intervals [0, 5), [5, 10), etc., then outer joins the sparse data to them.

    Split into ranges of five - sparse data

    with intervals as (  
      select 5 interval_size, 0 lower_bound, 100 upper_bound  
      from   dual  
    ), grps as (  
      select ( level - 1 ) * interval_size mn_x,  
             level * interval_size mx_x,  
             interval_size  
      from   intervals  
      connect by level <= (   
        -- unknown data size; create a range for all possible values  
        select max ( ceil ( upper_bound / interval_size ) ) from intervals  
      )  
    ), rws as (  
      select dbms_random.value ( lower_bound, upper_bound ) x  
      from   intervals connect by level <= 25  
    )  
      select mn_x, mx_x,   
             count ( x ) rws_per_group   
      from   grps  
      left join rws  
      on     x >= mn_x  
      and    x < mx_x  
      group  by mn_x, mx_x order by mn_x
    MN_X   MX_X   RWS_PER_GROUP
       0      5               2
       5     10               2
      10     15               0
      15     20               0
      20     25               0
      25     30               1
      30     35               1
      35     40               5
      40     45               0
      45     50               1
      50     55               2
      55     60               1
      60     65               1
      65     70               0
      70     75               0
      75     80               3
      80     85               2
      85     90               4
      90     95               1
      95    100               1

    20 rows selected.