fios2-iosched.c · 3464 lines (2925 loc) · 87.6 KB
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sched/clock.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
#include <linux/blk-cgroup.h>
#include "blk.h"
#include "blk-wbt.h"
/*
* tunables
*/
static const int cfq_alpha = 50;///xx - default alpha=50 (i.e. 0.5), 0 <= alpha < 100
static const int cfq_tservice = NSEC_PER_SEC / 125;///xx - Tservice, default 8ms, used for anticipation
static const int cfq_tsrv_nr = 1;///xx
static const u64 cfq_tsrv_sum = NSEC_PER_SEC / 125;///xx
static const u64 cfq_epoch_slice = NSEC_PER_SEC / 8;///xx - length of an epoch timeslice, default 125ms
/* max queue in one round of service */
static const int cfq_quantum = 8;
static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
static const u64 cfq_slice_sync = NSEC_PER_SEC / 10;
static u64 cfq_slice_async = NSEC_PER_SEC / 25;
static const int cfq_slice_async_rq = 2;
static u64 cfq_slice_idle = NSEC_PER_SEC / 125;
static u64 cfq_group_idle = NSEC_PER_SEC / 125;
static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;
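/*
* For reference, the defaults above work out to: fifo_expire 250ms / 125ms,
* slice_sync 100ms, slice_async 40ms, slice_idle 8ms, group_idle 8ms and
* target_latency 300ms.
*/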
/*
* offset from end of queue service tree for idle class
*/
#define CFQ_IDLE_DELAY (NSEC_PER_SEC / 5)
/* offset from end of group service tree under time slice mode */
#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5)
/* offset from end of group service tree under IOPS mode */
#define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5)
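/*
* Note: the time-slice mode delay above is in nanoseconds, while the IOPS
* mode delay is a plain count; in iops_mode() the group vdisktime advances
* by the number of dispatched requests rather than by elapsed time (see
* cfq_group_served() below), hence the different magnitudes.
*/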
/*
* below this threshold, we consider thinktime immediate
*/
#define CFQ_MIN_TT (2 * NSEC_PER_SEC / HZ)
#define CFQ_SLICE_SCALE (5)
#define CFQ_HW_QUEUE_MIN (5)
#define CFQ_SERVICE_SHIFT 12
#define CFQQ_SEEK_THR (sector_t)(8 * 100)
#define CFQQ_CLOSE_THR (sector_t)(8 * 1024)
#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
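/*
* CFQQ_SEEKY() below flags a queue as seeky once more than 32/8 = 4 of the
* 32 bits in its seek_history sample window are set.
*/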
#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq)
#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0])
#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1])
static struct kmem_cache *cfq_pool;
#define CFQ_PRIO_LISTS IOPRIO_BE_NR
#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
#define sample_valid(samples) ((samples) > 80)
#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
/* blkio-related constants */
#define CFQ_WEIGHT_LEGACY_MIN 10
#define CFQ_WEIGHT_LEGACY_DFL 500
#define CFQ_WEIGHT_LEGACY_MAX 1000
/*
* Most of our rbtree usage is for sorting with min extraction, so
* if we cache the leftmost node we don't have to walk down the tree
* to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
* move this into the elevator for the rq sorting as well.
*/
struct cfq_rb_root {
struct rb_root_cached rb;
struct rb_node *rb_rightmost;
unsigned count;
u64 min_vdisktime;
};
#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \
.rb_rightmost = NULL,}
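/*
* rb_first_cached() gives O(1) access to the leftmost (smallest key) entry;
* rb_rightmost is maintained by hand in __cfq_group_service_tree_add() and
* cfq_rb_erase() so that newly added groups can be queued behind the
* current tail of the service tree.
*/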
/*
* Per process-grouping structure
*/
struct cfq_queue {
/* reference count */
int ref;
/* various state flags, see below */
unsigned int flags;
/* parent cfq_data */
struct cfq_data *cfqd;
/* service_tree member */
struct rb_node rb_node;
/* service_tree key */
u64 rb_key;
/* prio tree member */
struct rb_node p_node;
/* prio tree root we belong to, if any */
struct rb_root *p_root;
/* sorted list of pending requests */
struct rb_root sort_list;
/* if fifo isn't expired, next request to serve */
struct request *next_rq;
/* requests queued in sort_list */
int queued[2];
/* currently allocated requests */
int allocated[2];
/* fifo list of requests in sort_list */
struct list_head fifo;
/* time when queue got scheduled in to dispatch first request. */
u64 dispatch_start;
u64 allocated_slice;
u64 slice_dispatch;
/* time when first request from queue completed and slice started. */
u64 slice_start;
u64 slice_end;
s64 slice_resid;
u64 epoch_start;
u64 epoch_slice;///xx
/* pending priority requests */
int prio_pending;
/* number of requests that are on the dispatch list or inside driver */
int dispatched;
/* io prio of this group */
unsigned short ioprio, org_ioprio;
unsigned short ioprio_class, org_ioprio_class;
pid_t pid;
u32 seek_history;
sector_t last_request_pos;
struct cfq_rb_root *service_tree;
struct cfq_queue *new_cfqq;
struct cfq_group *cfqg;
/* Number of sectors dispatched from queue in single dispatch round */
unsigned long nr_sectors;
};
/*
* First index in the service_trees.
* IDLE is handled separately: it gets its own service_tree_idle rather than
* a slot in service_trees (see st_for()).
*/
enum wl_class_t {
BE_WORKLOAD = 0,
RT_WORKLOAD = 1,
IDLE_WORKLOAD = 2,
CFQ_PRIO_NR,
};
/*
* Second index in the service_trees.
*/
enum wl_type_t {
ASYNC_WORKLOAD = 0,
SYNC_NOIDLE_WORKLOAD = 1,
SYNC_WORKLOAD = 2
};
struct cfqg_stats {};
/* This is per cgroup per device grouping structure */
struct cfq_group {
/* must be the first member */
struct blkg_policy_data pd;
/* group service_tree member */
struct rb_node rb_node;
/* group service_tree key */
u64 vdisktime;
/*
* The number of active cfqgs and sum of their weights under this
* cfqg. This covers this cfqg's leaf_weight and all children's
* weights, but does not cover weights of further descendants.
*
* If a cfqg is on the service tree, it's active. An active cfqg
* also activates its parent and contributes to the children_weight
* of the parent.
*/
int nr_active;
unsigned int children_weight;
/*
* vfraction is the fraction of vdisktime that the tasks in this
* cfqg are entitled to. This is determined by compounding the
* ratios walking up from this cfqg to the root.
*
* It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all
* vfractions on a service tree is approximately 1. The sum may
* deviate a bit due to rounding errors and fluctuations caused by
* cfqgs entering and leaving the service tree.
*/
unsigned int vfraction;
/*
* There are two weights - (internal) weight is the weight of this
* cfqg against the sibling cfqgs. leaf_weight is the weight of
* this cfqg against the child cfqgs. For the root cfqg, both
* weights are kept in sync for backward compatibility.
*/
unsigned int weight;
unsigned int new_weight;
unsigned int dev_weight;
unsigned int leaf_weight;
unsigned int new_leaf_weight;
unsigned int dev_leaf_weight;
/* number of cfqq currently on this group */
int nr_cfqq;
/*
* Per group busy queues average. Useful for workload slice calc. We
* create the array for each prio class but at run time it is used
* only for the RT and BE classes; the slot for the IDLE class remains unused.
* This is primarily done to avoid confusion and a gcc warning.
*/
unsigned int busy_queues_avg[CFQ_PRIO_NR];
/*
* rr lists of queues with requests. We maintain service trees for
* RT and BE classes. These trees are subdivided into subclasses
* of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For the IDLE
* class there is no subclassification and all the cfq queues go on
* a single tree service_tree_idle.
* Counts are embedded in the cfq_rb_root
*/
struct cfq_rb_root service_trees[2][3];
struct cfq_rb_root service_tree_idle;
u64 saved_wl_slice;
enum wl_type_t saved_wl_type;
enum wl_class_t saved_wl_class;
/* number of requests that are on the dispatch list or inside driver */
int dispatched;
struct cfqg_stats stats; /* stats for this cfqg */
/* async queue for each priority case */
struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
struct cfq_queue *async_idle_cfqq;
};
struct cfq_io_cq {
struct io_cq icq; /* must be the first member */
struct cfq_queue *cfqq[2];
int ioprio; /* the current ioprio */
};
/*
* Per block device queue structure
*/
struct cfq_data {
struct request_queue *queue;
/* Root service tree for cfq_groups */
struct cfq_rb_root grp_service_tree;
struct cfq_group *root_group;
/*
* The priority currently being served
*/
enum wl_class_t serving_wl_class;
enum wl_type_t serving_wl_type;
u64 workload_expires;
struct cfq_group *serving_group;
/*
* Each priority tree is sorted by next_request position. These
* trees are used when determining if two or more queues are
* interleaving requests (see cfq_close_cooperator).
*/
struct rb_root prio_trees[CFQ_PRIO_LISTS];
unsigned int busy_queues;
unsigned int busy_sync_queues;
int rq_in_driver;
int rq_in_flight[2];
/*
* queue-depth detection
*/
int rq_queued;
int hw_tag;
/*
* hw_tag can be
* -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
* 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
* 0 => no NCQ
*/
int hw_tag_est_depth;
unsigned int hw_tag_samples;
/*
* idle window management
*/
struct hrtimer idle_slice_timer;
struct work_struct unplug_work;
struct cfq_queue *active_queue;
struct cfq_io_cq *active_cic;
sector_t last_position;
/*
* tunables, see top of file
*/
unsigned int cfq_alpha;///xx - alpha threshold, used in the anticipation calculation
unsigned int cfq_tservice;///xx - Tservice, used in the anticipation calculation
unsigned int cfq_tsrv_nr;///xx - number of services for the task being anticipated
unsigned int cfq_is_anti;///xx
u64 cfq_tsrv_start;
u64 cfq_tsrv_sum;///xx - sum of service time for the task being anticipated
unsigned int cfq_quantum;
unsigned int cfq_slice_async_rq;
unsigned int cfq_latency;
u64 cfq_fifo_expire[2];
u64 cfq_slice[2];
u64 cfq_slice_idle;
u64 cfq_group_idle;
u64 cfq_target_latency;
/*
* Fallback dummy cfqq for extreme OOM conditions
*/
struct cfq_queue oom_cfqq;
u64 last_delayed_sync;
};
static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
static void cfq_put_queue(struct cfq_queue *cfqq);
static struct cfq_rb_root *st_for(struct cfq_group *cfqg,
enum wl_class_t class,
enum wl_type_t type)
{
if (!cfqg)
return NULL;
if (class == IDLE_WORKLOAD)
return &cfqg->service_tree_idle;
return &cfqg->service_trees[class][type];
}
enum cfqq_state_flags {
CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
CFQ_CFQQ_FLAG_sync, /* synchronous queue */
CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be split */
CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
};
#define CFQ_CFQQ_FNS(name) \
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \
{ \
(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \
} \
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \
{ \
(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \
} \
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
{ \
return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \
}
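/*
* Each CFQ_CFQQ_FNS(name) invocation below expands to three inline helpers,
* e.g. CFQ_CFQQ_FNS(on_rr) provides cfq_mark_cfqq_on_rr(),
* cfq_clear_cfqq_on_rr() and cfq_cfqq_on_rr() to set, clear and test the
* corresponding flag bit in cfqq->flags.
*/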
CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
CFQ_CFQQ_FNS(must_dispatch);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
CFQ_CFQQ_FNS(slice_new);
CFQ_CFQQ_FNS(sync);
CFQ_CFQQ_FNS(coop);
CFQ_CFQQ_FNS(split_coop);
CFQ_CFQQ_FNS(deep);
CFQ_CFQQ_FNS(wait_busy);
#undef CFQ_CFQQ_FNS
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \
cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\
##args)
#define cfq_log(cfqd, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
/* Traverses through cfq group service trees */
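/*
* The loop visits the six RT/BE trees (ASYNC, SYNC_NOIDLE and SYNC for each
* of the two classes) and finally the single idle tree, so callers can
* initialise or scan every per-group service tree in one pass.
*/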
#define for_each_cfqg_st(cfqg, i, j, st) \
for (i = 0; i <= IDLE_WORKLOAD; i++) \
for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
: &cfqg->service_tree_idle; \
(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
(i == IDLE_WORKLOAD && j == 0); \
j++, st = i < IDLE_WORKLOAD ? \
&cfqg->service_trees[i][j]: NULL)
static inline bool iops_mode(struct cfq_data *cfqd)
{
/*
* If we are not idling on queues and the drive supports NCQ, requests
* execute in parallel and measuring time is not possible in most cases
* unless we drive shallow queue depths, which then becomes a performance
* bottleneck. In such cases switch to providing fairness in terms of
* number of IOs.
*/
if (!cfqd->cfq_slice_idle && cfqd->hw_tag)
return true;
else
return false;
}
static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq)
{
if (cfq_class_idle(cfqq))
return IDLE_WORKLOAD;
if (cfq_class_rt(cfqq))
return RT_WORKLOAD;
return BE_WORKLOAD;
}
static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
{
if (!cfq_cfqq_sync(cfqq))
return ASYNC_WORKLOAD;
if (!cfq_cfqq_idle_window(cfqq))
return SYNC_NOIDLE_WORKLOAD;
return SYNC_WORKLOAD;
}
static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class,
struct cfq_data *cfqd,
struct cfq_group *cfqg)
{
if (wl_class == IDLE_WORKLOAD)
return cfqg->service_tree_idle.count;
return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count +
cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count +
cfqg->service_trees[wl_class][SYNC_WORKLOAD].count;
}
static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
struct cfq_group *cfqg)
{
return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count +
cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
}
static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
struct cfq_io_cq *cic, struct bio *bio);
static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
{
/* cic->icq is the first member, %NULL will convert to %NULL */
return container_of(icq, struct cfq_io_cq, icq);
}
static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
struct io_context *ioc)
{
if (ioc)
return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
return NULL;
}
static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
{
return cic->cfqq[is_sync];
}
static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
bool is_sync)
{
cic->cfqq[is_sync] = cfqq;
}
static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
{
return cic->icq.q->elevator->elevator_data;
}
/*
* scheduler run of queue, if there are requests pending and no one in the
* driver that will restart queueing
*/
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
if (cfqd->busy_queues) {
cfq_log(cfqd, "schedule dispatch");
kblockd_schedule_work(&cfqd->unplug_work);
}
}
/*
* Scale the schedule slice based on io priority. Use the sync time slice only
* if a queue is marked sync and has sync io queued. A sync queue with async
* io only should not get the full sync slice length.
*/
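/*
* Example with the defaults above: cfq_slice_sync is 100ms, so the per-prio
* step is 100ms / CFQ_SLICE_SCALE = 20ms; a prio-4 (default) sync queue gets
* 100ms while a prio-0 sync queue gets 100ms + 4 * 20ms = 180ms.
*/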
static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync,
unsigned short prio)
{
u64 base_slice = cfqd->cfq_slice[sync];
u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE);
WARN_ON(prio >= IOPRIO_BE_NR);
return base_slice + (slice * (4 - prio));
}
static inline u64
cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}
/**
* cfqg_scale_charge - scale disk time charge according to cfqg weight
* @charge: disk time being charged
* @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT
*
* Scale @charge according to @vfraction, which is in range (0, 1]. The
* scaling is inversely proportional.
*
* scaled = charge / vfraction
*
* The result is also in fixed point w/ CFQ_SERVICE_SHIFT.
*/
static inline u64 cfqg_scale_charge(u64 charge,
unsigned int vfraction)
{
u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */
/* charge / vfraction */
c <<= CFQ_SERVICE_SHIFT;
return div_u64(c, vfraction);
}
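/*
* Example: a group entitled to half of the device has
* vfraction = 1 << (CFQ_SERVICE_SHIFT - 1), so charging it 8ms of disk time
* advances its vdisktime by the fixed-point equivalent of 16ms - smaller
* shares accumulate vdisktime faster and are therefore scheduled less often.
*/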
/*
* Get the averaged number of queues of RT/BE priority. The average is
* updated with a formula that gives more weight to higher numbers, so it
* follows sudden increases quickly and decreases slowly.
*/
static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
struct cfq_group *cfqg, bool rt)
{
unsigned min_q, max_q;
unsigned mult = cfq_hist_divisor - 1;
unsigned round = cfq_hist_divisor / 2;
unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
min_q = min(cfqg->busy_queues_avg[rt], busy);
max_q = max(cfqg->busy_queues_avg[rt], busy);
cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
cfq_hist_divisor;
return cfqg->busy_queues_avg[rt];
}
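/*
* With cfq_hist_divisor == 4 the update above is
* avg = (3 * max(avg, busy) + min(avg, busy) + 2) / 4, so a jump in the
* number of busy queues shows up almost immediately while a drop decays
* over several updates.
*/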
static inline u64
cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT;
}
static inline u64
cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
u64 slice = cfq_prio_to_slice(cfqd, cfqq);
if (cfqd->cfq_latency) {
/*
* interested queues (we consider only the ones with the same
* priority class in the cfq group)
*/
unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
cfq_class_rt(cfqq));
u64 sync_slice = cfqd->cfq_slice[1];
u64 expect_latency = sync_slice * iq;
u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
if (expect_latency > group_slice) {
u64 base_low_slice = 2 * cfqd->cfq_slice_idle;
u64 low_slice;
/* scale low_slice according to IO priority
* and sync vs async */
low_slice = div64_u64(base_low_slice*slice, sync_slice);
low_slice = min(slice, low_slice);
/* the adapted slice value is scaled to fit all iqs
* into the target latency */
slice = div64_u64(slice*group_slice, expect_latency);
slice = max(slice, low_slice);
}
}
return slice;
}
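/*
* In short: if the queues of this priority class would together overrun the
* group's share of cfq_target_latency, each slice is shrunk proportionally,
* but never below the low_slice floor derived from cfq_slice_idle.
*/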
static inline void
cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
u64 now = ktime_get_ns();
cfqq->slice_start = now;
cfqq->slice_end = now + slice;
cfqq->allocated_slice = slice;
cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now);
}
/*
* We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
* isn't valid until the first request from the dispatch is activated
* and the slice time set.
*/
static inline bool cfq_slice_used(struct cfq_queue *cfqq)
{
if (cfq_cfqq_slice_new(cfqq))
return false;
if (ktime_get_ns() < cfqq->slice_end)
return false;
return true;
}
/*
* Lifted from AS - choose which of rq1 and rq2 is best served now.
* In this variant sync requests win over async, REQ_PRIO requests win over
* normal ones, and reads are preferred over writes (///xx); the original
* head-distance heuristic is no longer used.
*/
static struct request *
cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
{
if (rq1 == NULL || rq1 == rq2)
return rq2;
if (rq2 == NULL)
return rq1;
if (rq_is_sync(rq1) != rq_is_sync(rq2))
return rq_is_sync(rq1) ? rq1 : rq2;
if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
//////xx - read performance: if the ops differ, prefer the read (non-write) request
if (req_op(rq1) == req_op(rq2))
return rq1;
else
return op_is_write(req_op(rq1)) ? rq2 : rq1;
//////xx
}
static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
{
/* Service tree is empty */
if (!root->count)
return NULL;
return rb_entry(rb_first_cached(&root->rb), struct cfq_queue, rb_node);
}
static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
{
return rb_entry_cfqg(rb_first_cached(&root->rb));
}
static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
{
if (root->rb_rightmost == n)
root->rb_rightmost = rb_prev(n);
rb_erase_cached(n, &root->rb);
RB_CLEAR_NODE(n);
--root->count;
}
/*
* would be nice to take fifo expire time into account as well
*/
static struct request *
cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
struct request *last)
{
struct rb_node *rbnext = rb_next(&last->rb_node);
struct rb_node *rbprev = rb_prev(&last->rb_node);
struct request *next = NULL, *prev = NULL;
BUG_ON(RB_EMPTY_NODE(&last->rb_node));
if (rbprev)
prev = rb_entry_rq(rbprev);
if (rbnext)
next = rb_entry_rq(rbnext);
else {
rbnext = rb_first(&cfqq->sort_list);
if (rbnext && rbnext != &last->rb_node)
next = rb_entry_rq(rbnext);
}
return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
}
static u64 cfq_slice_offset(struct cfq_data *cfqd,
struct cfq_queue *cfqq)
{
/*
* just an approximation, should be ok.
*/
return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
}
static inline s64
cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
return cfqg->vdisktime - st->min_vdisktime;
}
static void
__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
struct rb_node **node = &st->rb.rb_root.rb_node;
struct rb_node *parent = NULL;
struct cfq_group *__cfqg;
s64 key = cfqg_key(st, cfqg);
bool leftmost = true, rightmost = true;
while (*node != NULL) {
parent = *node;
__cfqg = rb_entry_cfqg(parent);
if (key < cfqg_key(st, __cfqg)) {
node = &parent->rb_left;
rightmost = false;
} else {
node = &parent->rb_right;
leftmost = false;
}
}
if (rightmost)
st->rb_rightmost = &cfqg->rb_node;
rb_link_node(&cfqg->rb_node, parent, node);
rb_insert_color_cached(&cfqg->rb_node, &st->rb, leftmost);
}
static void
cfq_update_group_leaf_weight(struct cfq_group *cfqg)
{
BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
if (cfqg->new_leaf_weight) {
cfqg->leaf_weight = cfqg->new_leaf_weight;
cfqg->new_leaf_weight = 0;
}
}
static void
cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */
struct cfq_group *pos = cfqg;
bool propagate;
/* add to the service tree */
BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
/*
* Update leaf_weight. We cannot update weight at this point
* because cfqg might already have been activated and is
* contributing its current weight to the parent's children_weight.
*/
cfq_update_group_leaf_weight(cfqg);
__cfq_group_service_tree_add(st, cfqg);
/*
* Activate @cfqg and calculate the portion of vfraction @cfqg is
* entitled to. vfraction is calculated by walking the tree
* towards the root calculating the fraction it has at each level.
* The compounded ratio is how much vfraction @cfqg owns.
*
* Start with the proportion the tasks in this cfqg have against active
* children cfqgs - its leaf_weight against children_weight.
*/
propagate = !pos->nr_active++;
pos->children_weight += pos->leaf_weight;
vfr = vfr * pos->leaf_weight / pos->children_weight;
cfqg->vfraction = max_t(unsigned, vfr, 1);
}
static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd)
{
if (!iops_mode(cfqd))
return CFQ_SLICE_MODE_GROUP_DELAY;
else
return CFQ_IOPS_MODE_GROUP_DELAY;
}
static void
cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
struct cfq_group *__cfqg;
struct rb_node *n;
cfqg->nr_cfqq++;
if (!RB_EMPTY_NODE(&cfqg->rb_node))
return;
/*
* Currently put the group at the end. Later, implement something
* so that groups get less vtime based on their weights, so that a
* group does not lose everything if it was not continuously backlogged.
*/
n = st->rb_rightmost;
if (n) {
__cfqg = rb_entry_cfqg(n);
cfqg->vdisktime = __cfqg->vdisktime +
cfq_get_cfqg_vdisktime_delay(cfqd);
} else
cfqg->vdisktime = st->min_vdisktime;
cfq_group_service_tree_add(st, cfqg);
}
static void
cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
{
struct cfq_group *pos = cfqg;
bool propagate;
/*
* Undo activation from cfq_group_service_tree_add(). Deactivate
* @cfqg and propagate deactivation upwards.
*/
propagate = !--pos->nr_active;
pos->children_weight -= pos->leaf_weight;
while (propagate) {
/* @pos has 0 nr_active at this point */
WARN_ON_ONCE(pos->children_weight);
pos->vfraction = 0;
break;
}
/* remove from the service tree */
if (!RB_EMPTY_NODE(&cfqg->rb_node))
cfq_rb_erase(&cfqg->rb_node, st);
}
static void
cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
BUG_ON(cfqg->nr_cfqq < 1);
cfqg->nr_cfqq--;
/* If there are other cfq queues under this group, don't delete it */
if (cfqg->nr_cfqq)
return;
cfq_group_service_tree_del(st, cfqg);
cfqg->saved_wl_slice = 0;
}
static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
u64 *unaccounted_time)
{
u64 slice_used;
u64 now = ktime_get_ns();
/*
* Queue got expired before even a single request completed or
* got expired immediately after first request completion.
*/
if (!cfqq->slice_start || cfqq->slice_start == now) {
/*
* Also charge the seek time incurred to the group, otherwise
* if there are multiple queues in the group, each can dispatch
* a single request on seeky media and cause lots of seek time,
* and the group will never know it.
*/
slice_used = max_t(u64, (now - cfqq->dispatch_start),
jiffies_to_nsecs(1));
} else {
slice_used = now - cfqq->slice_start;
if (slice_used > cfqq->allocated_slice) {
*unaccounted_time = slice_used - cfqq->allocated_slice;
slice_used = cfqq->allocated_slice;
}
if (cfqq->slice_start > cfqq->dispatch_start)
*unaccounted_time += cfqq->slice_start -
cfqq->dispatch_start;
}
return slice_used;
}
static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
struct cfq_queue *cfqq)
{
struct cfq_rb_root *st = &cfqd->grp_service_tree;
u64 used_sl, charge, unaccounted_sl = 0;
int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
- cfqg->service_tree_idle.count;
unsigned int vfr;
u64 now = ktime_get_ns();
BUG_ON(nr_sync < 0);
used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
if (iops_mode(cfqd))
charge = cfqq->slice_dispatch;
else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
charge = cfqq->allocated_slice;
/*
* Can't update vdisktime while on service tree and cfqg->vfraction
* is valid only while on it. Cache vfr, leave the service tree,
* update vdisktime and go back on. The re-addition to the tree
* will also update the weights as necessary.
*/
vfr = cfqg->vfraction;
cfq_group_service_tree_del(st, cfqg);
cfqg->vdisktime += cfqg_scale_charge(charge, vfr);
cfq_group_service_tree_add(st, cfqg);
/* This group is being expired. Save the context */
if (cfqd->workload_expires > now) {
cfqg->saved_wl_slice = cfqd->workload_expires - now;
cfqg->saved_wl_type = cfqd->serving_wl_type;
cfqg->saved_wl_class = cfqd->serving_wl_class;
} else
cfqg->saved_wl_slice = 0;
cfq_log_cfqq(cfqq->cfqd, cfqq,
"sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu",
used_sl, cfqq->slice_dispatch, charge,
iops_mode(cfqd), cfqq->nr_sectors);
}
/**
* cfq_init_cfqg_base - initialize base part of a cfq_group
* @cfqg: cfq_group to initialize
*
* Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
* is enabled or not.
*/
static void cfq_init_cfqg_base(struct cfq_group *cfqg)
{
struct cfq_rb_root *st;
int i, j;
for_each_cfqg_st(cfqg, i, j, st)
*st = CFQ_RB_ROOT;
RB_CLEAR_NODE(&cfqg->rb_node);
}
static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
struct blkcg *blkcg)
{
return cfqd->root_group;
}
static inline void
cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
cfqq->cfqg = cfqg;
}