target/linux/generic/backport-5.10/630-v5.15-page_pool_frag_support.patch

   1 --- a/include/net/page_pool.h
   2 +++ b/include/net/page_pool.h
   3 @@ -45,7 +45,10 @@
   4                                         * Please note DMA-sync-for-CPU is still
   5                                         * device driver responsibility
   6                                         */
   7 -#define PP_FLAG_ALL            (PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV)
   8 +#define PP_FLAG_PAGE_FRAG      BIT(2) /* for page frag feature */
   9 +#define PP_FLAG_ALL            (PP_FLAG_DMA_MAP |\
  10 +                                PP_FLAG_DMA_SYNC_DEV |\
  11 +                                PP_FLAG_PAGE_FRAG)
  12
  13  /*
  14   * Fast allocation side cache array/stack
  15 @@ -65,7 +68,7 @@
  16  #define PP_ALLOC_CACHE_REFILL  64
  17  struct pp_alloc_cache {
  18         u32 count;
  19 -       void *cache[PP_ALLOC_CACHE_SIZE];
  20 +       struct page *cache[PP_ALLOC_CACHE_SIZE];
  21  };
  22
  23  struct page_pool_params {
  24 @@ -79,6 +82,22 @@ struct page_pool_params {
  25         unsigned int    offset;  /* DMA addr offset */
  26  };
  27
  28 +
  29 +static inline int page_pool_ethtool_stats_get_count(void)
  30 +{
  31 +       return 0;
  32 +}
  33 +
  34 +static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data)
  35 +{
  36 +       return data;
  37 +}
  38 +
  39 +static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats)
  40 +{
  41 +       return data;
  42 +}
  43 +
  44  struct page_pool {
  45         struct page_pool_params p;
  46
  47 @@ -88,6 +107,9 @@ struct page_pool {
  48         unsigned long defer_warn;
  49
  50         u32 pages_state_hold_cnt;
  51 +       unsigned int frag_offset;
  52 +       struct page *frag_page;
  53 +       long frag_users;
  54
  55         /*
  56          * Data structure for allocation side
  57 @@ -137,6 +159,18 @@ static inline struct page *page_pool_dev
  58         return page_pool_alloc_pages(pool, gfp);
  59  }
  60
  61 +struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset,
  62 +                                 unsigned int size, gfp_t gfp);
  63 +
  64 +static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool,
  65 +                                                   unsigned int *offset,
  66 +                                                   unsigned int size)
  67 +{
  68 +       gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
  69 +
  70 +       return page_pool_alloc_frag(pool, offset, size, gfp);
  71 +}
  72 +
  73  /* get the stored dma direction. A driver might decide to treat this locally and
  74   * avoid the extra cache line from page_pool to determine the direction
  75   */
  76 @@ -146,6 +180,8 @@ inline enum dma_data_direction page_pool
  77         return pool->p.dma_dir;
  78  }
  79
  80 +bool page_pool_return_skb_page(struct page *page);
  81 +
  82  struct page_pool *page_pool_create(const struct page_pool_params *params);
  83
  84  #ifdef CONFIG_PAGE_POOL
  85 @@ -165,6 +201,7 @@ static inline void page_pool_release_pag
  86                                           struct page *page)
  87  {
  88  }
  89 +
  90  #endif
  91
  92  void page_pool_put_page(struct page_pool *pool, struct page *page,
  93 @@ -189,19 +226,48 @@ static inline void page_pool_recycle_dir
  94         page_pool_put_full_page(pool, page, true);
  95  }
  96
  97 +#define PAGE_POOL_DMA_USE_PP_FRAG_COUNT        \
  98 +               (sizeof(dma_addr_t) > sizeof(unsigned long))
  99 +
 100  static inline dma_addr_t page_pool_get_dma_addr(struct page *page)
 101  {
 102 -       dma_addr_t ret = page->dma_addr[0];
 103 -       if (sizeof(dma_addr_t) > sizeof(unsigned long))
 104 -               ret |= (dma_addr_t)page->dma_addr[1] << 16 << 16;
 105 +       dma_addr_t ret = page->dma_addr;
 106 +
 107 +       if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
 108 +               ret |= (dma_addr_t)page->dma_addr_upper << 16 << 16;
 109 +
 110         return ret;
 111  }
 112
 113  static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr)
 114  {
 115 -       page->dma_addr[0] = addr;
 116 -       if (sizeof(dma_addr_t) > sizeof(unsigned long))
 117 -               page->dma_addr[1] = upper_32_bits(addr);
 118 +       page->dma_addr = addr;
 119 +       if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT)
 120 +               page->dma_addr_upper = upper_32_bits(addr);
 121 +}
 122 +
 123 +static inline void page_pool_set_frag_count(struct page *page, long nr)
 124 +{
 125 +       atomic_long_set(&page->pp_frag_count, nr);
 126 +}
 127 +
 128 +static inline long page_pool_atomic_sub_frag_count_return(struct page *page,
 129 +                                                         long nr)
 130 +{
 131 +       long ret;
 132 +
 133 +       /* As suggested by Alexander, atomic_long_read() may cover up the
 134 +        * reference count errors, so avoid calling atomic_long_read() in
 135 +        * the cases of freeing or draining the page_frags, where we would
 136 +        * not expect it to match or that are slowpath anyway.
 137 +        */
 138 +       if (__builtin_constant_p(nr) &&
 139 +           atomic_long_read(&page->pp_frag_count) == nr)
 140 +               return 0;
 141 +
 142 +       ret = atomic_long_sub_return(nr, &page->pp_frag_count);
 143 +       WARN_ON(ret < 0);
 144 +       return ret;
 145  }
 146
 147  static inline bool is_page_pool_compiled_in(void)
 148 @@ -225,4 +291,23 @@ static inline void page_pool_nid_changed
 149         if (unlikely(pool->p.nid != new_nid))
 150                 page_pool_update_nid(pool, new_nid);
 151  }
 152 +
 153 +static inline void page_pool_ring_lock(struct page_pool *pool)
 154 +       __acquires(&pool->ring.producer_lock)
 155 +{
 156 +       if (in_serving_softirq())
 157 +               spin_lock(&pool->ring.producer_lock);
 158 +       else
 159 +               spin_lock_bh(&pool->ring.producer_lock);
 160 +}
 161 +
 162 +static inline void page_pool_ring_unlock(struct page_pool *pool)
 163 +       __releases(&pool->ring.producer_lock)
 164 +{
 165 +       if (in_serving_softirq())
 166 +               spin_unlock(&pool->ring.producer_lock);
 167 +       else
 168 +               spin_unlock_bh(&pool->ring.producer_lock);
 169 +}
 170 +
 171  #endif /* _NET_PAGE_POOL_H */
 172 --- a/net/core/page_pool.c
 173 +++ b/net/core/page_pool.c
 174 @@ -11,16 +11,22 @@
 175  #include <linux/device.h>
 176
 177  #include <net/page_pool.h>
 178 +#include <net/xdp.h>
 179 +
 180  #include <linux/dma-direction.h>
 181  #include <linux/dma-mapping.h>
 182  #include <linux/page-flags.h>
 183  #include <linux/mm.h> /* for __put_page() */
 184 +#include <linux/poison.h>
 185 +#include <linux/ethtool.h>
 186
 187  #include <trace/events/page_pool.h>
 188
 189  #define DEFER_TIME (msecs_to_jiffies(1000))
 190  #define DEFER_WARN_INTERVAL (60 * HZ)
 191
 192 +#define BIAS_MAX       LONG_MAX
 193 +
 194  static int page_pool_init(struct page_pool *pool,
 195                           const struct page_pool_params *params)
 196  {
 197 @@ -64,6 +70,10 @@ static int page_pool_init(struct page_po
 198                  */
 199         }
 200
 201 +       if (PAGE_POOL_DMA_USE_PP_FRAG_COUNT &&
 202 +           pool->p.flags & PP_FLAG_PAGE_FRAG)
 203 +               return -EINVAL;
 204 +
 205         if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 206                 return -ENOMEM;
 207
 208 @@ -180,40 +190,10 @@ static void page_pool_dma_sync_for_devic
 209                                          pool->p.dma_dir);
 210  }
 211
 212 -/* slow path */
 213 -noinline
 214 -static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 215 -                                                gfp_t _gfp)
 216 +static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
 217  {
 218 -       struct page *page;
 219 -       gfp_t gfp = _gfp;
 220         dma_addr_t dma;
 221
 222 -       /* We could always set __GFP_COMP, and avoid this branch, as
 223 -        * prep_new_page() can handle order-0 with __GFP_COMP.
 224 -        */
 225 -       if (pool->p.order)
 226 -               gfp |= __GFP_COMP;
 227 -
 228 -       /* FUTURE development:
 229 -        *
 230 -        * Current slow-path essentially falls back to single page
 231 -        * allocations, which doesn't improve performance.  This code
 232 -        * need bulk allocation support from the page allocator code.
 233 -        */
 234 -
 235 -       /* Cache was empty, do real allocation */
 236 -#ifdef CONFIG_NUMA
 237 -       page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
 238 -#else
 239 -       page = alloc_pages(gfp, pool->p.order);
 240 -#endif
 241 -       if (!page)
 242 -               return NULL;
 243 -
 244 -       if (!(pool->p.flags & PP_FLAG_DMA_MAP))
 245 -               goto skip_dma_map;
 246 -
 247         /* Setup DMA mapping: use 'struct page' area for storing DMA-addr
 248          * since dma_addr_t can be either 32 or 64 bits and does not always fit
 249          * into page private data (i.e 32bit cpu with 64bit DMA caps)
 250 @@ -222,22 +202,53 @@ static struct page *__page_pool_alloc_pa
 251         dma = dma_map_page_attrs(pool->p.dev, page, 0,
 252                                  (PAGE_SIZE << pool->p.order),
 253                                  pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC);
 254 -       if (dma_mapping_error(pool->p.dev, dma)) {
 255 -               put_page(page);
 256 -               return NULL;
 257 -       }
 258 +       if (dma_mapping_error(pool->p.dev, dma))
 259 +               return false;
 260 +
 261         page_pool_set_dma_addr(page, dma);
 262
 263         if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
 264                 page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
 265
 266 -skip_dma_map:
 267 +       return true;
 268 +}
 269 +
 270 +static void page_pool_set_pp_info(struct page_pool *pool,
 271 +                                 struct page *page)
 272 +{
 273 +       page->pp = pool;
 274 +       page->pp_magic |= PP_SIGNATURE;
 275 +}
 276 +
 277 +static void page_pool_clear_pp_info(struct page *page)
 278 +{
 279 +       page->pp_magic = 0;
 280 +       page->pp = NULL;
 281 +}
 282 +
 283 +/* slow path */
 284 +noinline
 285 +static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 286 +                                                gfp_t gfp)
 287 +{
 288 +       struct page *page;
 289 +
 290 +       gfp |= __GFP_COMP;
 291 +       page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
 292 +       if (unlikely(!page))
 293 +               return NULL;
 294 +
 295 +       if ((pool->p.flags & PP_FLAG_DMA_MAP) &&
 296 +           unlikely(!page_pool_dma_map(pool, page))) {
 297 +               put_page(page);
 298 +               return NULL;
 299 +       }
 300 +
 301 +       page_pool_set_pp_info(pool, page);
 302 +
 303         /* Track how many pages are held 'in-flight' */
 304         pool->pages_state_hold_cnt++;
 305 -
 306         trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
 307 -
 308 -       /* When page just alloc'ed is should/must have refcnt 1. */
 309         return page;
 310  }
 311
 312 @@ -302,10 +313,12 @@ void page_pool_release_page(struct page_
 313                              DMA_ATTR_SKIP_CPU_SYNC);
 314         page_pool_set_dma_addr(page, 0);
 315  skip_dma_unmap:
 316 +       page_pool_clear_pp_info(page);
 317 +
 318         /* This may be the last page returned, releasing the pool, so
 319          * it is not safe to reference pool afterwards.
 320          */
 321 -       count = atomic_inc_return(&pool->pages_state_release_cnt);
 322 +       count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt);
 323         trace_page_pool_state_release(pool, page, count);
 324  }
 325  EXPORT_SYMBOL(page_pool_release_page);
 326 @@ -331,7 +344,10 @@ static bool page_pool_recycle_in_ring(st
 327         else
 328                 ret = ptr_ring_produce_bh(&pool->ring, page);
 329
 330 -       return (ret == 0) ? true : false;
 331 +       if (!ret)
 332 +               return true;
 333 +
 334 +       return false;
 335  }
 336
 337  /* Only allow direct recycling in special circumstances, into the
 338 @@ -350,46 +366,43 @@ static bool page_pool_recycle_in_cache(s
 339         return true;
 340  }
 341
 342 -/* page is NOT reusable when:
 343 - * 1) allocated when system is under some pressure. (page_is_pfmemalloc)
 344 - */
 345 -static bool pool_page_reusable(struct page_pool *pool, struct page *page)
 346 -{
 347 -       return !page_is_pfmemalloc(page);
 348 -}
 349 -
 350  /* If the page refcnt == 1, this will try to recycle the page.
 351   * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for
 352   * the configured size min(dma_sync_size, pool->max_len).
 353   * If the page refcnt != 1, then the page will be returned to memory
 354   * subsystem.
 355   */
 356 -void page_pool_put_page(struct page_pool *pool, struct page *page,
 357 -                       unsigned int dma_sync_size, bool allow_direct)
 358 -{
 359 +static __always_inline struct page *
 360 +__page_pool_put_page(struct page_pool *pool, struct page *page,
 361 +                    unsigned int dma_sync_size, bool allow_direct)
 362 +{
 363 +       /* It is not the last user for the page frag case */
 364 +       if (pool->p.flags & PP_FLAG_PAGE_FRAG &&
 365 +           page_pool_atomic_sub_frag_count_return(page, 1))
 366 +               return NULL;
 367 +
 368         /* This allocator is optimized for the XDP mode that uses
 369          * one-frame-per-page, but have fallbacks that act like the
 370          * regular page allocator APIs.
 371          *
 372          * refcnt == 1 means page_pool owns page, and can recycle it.
 373 +        *
 374 +        * page is NOT reusable if it was allocated while the system was
 375 +        * under some pressure. (page_is_pfmemalloc)
 376          */
 377 -       if (likely(page_ref_count(page) == 1 &&
 378 -                  pool_page_reusable(pool, page))) {
 379 +       if (likely(page_ref_count(page) == 1 && !page_is_pfmemalloc(page))) {
 380                 /* Read barrier done in page_ref_count / READ_ONCE */
 381
 382                 if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
 383                         page_pool_dma_sync_for_device(pool, page,
 384                                                       dma_sync_size);
 385
 386 -               if (allow_direct && in_serving_softirq())
 387 -                       if (page_pool_recycle_in_cache(page, pool))
 388 -                               return;
 389 +               if (allow_direct && in_serving_softirq() &&
 390 +                   page_pool_recycle_in_cache(page, pool))
 391 +                       return NULL;
 392
 393 -               if (!page_pool_recycle_in_ring(pool, page)) {
 394 -                       /* Cache full, fallback to free pages */
 395 -                       page_pool_return_page(pool, page);
 396 -               }
 397 -               return;
 398 +               /* Page found as candidate for recycling */
 399 +               return page;
 400         }
 401         /* Fallback/non-XDP mode: API user have elevated refcnt.
 402          *
 403 @@ -407,9 +420,98 @@ void page_pool_put_page(struct page_pool
 404         /* Do not replace this with page_pool_return_page() */
 405         page_pool_release_page(pool, page);
 406         put_page(page);
 407 +
 408 +       return NULL;
 409 +}
 410 +
 411 +void page_pool_put_page(struct page_pool *pool, struct page *page,
 412 +                       unsigned int dma_sync_size, bool allow_direct)
 413 +{
 414 +       page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct);
 415 +       if (page && !page_pool_recycle_in_ring(pool, page))
 416 +               /* Cache full, fallback to free pages */
 417 +               page_pool_return_page(pool, page);
 418  }
 419  EXPORT_SYMBOL(page_pool_put_page);
 420
 421 +static struct page *page_pool_drain_frag(struct page_pool *pool,
 422 +                                        struct page *page)
 423 +{
 424 +       long drain_count = BIAS_MAX - pool->frag_users;
 425 +
 426 +       /* Some user is still using the page frag */
 427 +       if (likely(page_pool_atomic_sub_frag_count_return(page,
 428 +                                                         drain_count)))
 429 +               return NULL;
 430 +
 431 +       if (page_ref_count(page) == 1 && !page_is_pfmemalloc(page)) {
 432 +               if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
 433 +                       page_pool_dma_sync_for_device(pool, page, -1);
 434 +
 435 +               return page;
 436 +       }
 437 +
 438 +       page_pool_return_page(pool, page);
 439 +       return NULL;
 440 +}
 441 +
 442 +static void page_pool_free_frag(struct page_pool *pool)
 443 +{
 444 +       long drain_count = BIAS_MAX - pool->frag_users;
 445 +       struct page *page = pool->frag_page;
 446 +
 447 +       pool->frag_page = NULL;
 448 +
 449 +       if (!page ||
 450 +           page_pool_atomic_sub_frag_count_return(page, drain_count))
 451 +               return;
 452 +
 453 +       page_pool_return_page(pool, page);
 454 +}
 455 +
 456 +struct page *page_pool_alloc_frag(struct page_pool *pool,
 457 +                                 unsigned int *offset,
 458 +                                 unsigned int size, gfp_t gfp)
 459 +{
 460 +       unsigned int max_size = PAGE_SIZE << pool->p.order;
 461 +       struct page *page = pool->frag_page;
 462 +
 463 +       if (WARN_ON(!(pool->p.flags & PP_FLAG_PAGE_FRAG) ||
 464 +                   size > max_size))
 465 +               return NULL;
 466 +
 467 +       size = ALIGN(size, dma_get_cache_alignment());
 468 +       *offset = pool->frag_offset;
 469 +
 470 +       if (page && *offset + size > max_size) {
 471 +               page = page_pool_drain_frag(pool, page);
 472 +               if (page)
 473 +                       goto frag_reset;
 474 +       }
 475 +
 476 +       if (!page) {
 477 +               page = page_pool_alloc_pages(pool, gfp);
 478 +               if (unlikely(!page)) {
 479 +                       pool->frag_page = NULL;
 480 +                       return NULL;
 481 +               }
 482 +
 483 +               pool->frag_page = page;
 484 +
 485 +frag_reset:
 486 +               pool->frag_users = 1;
 487 +               *offset = 0;
 488 +               pool->frag_offset = size;
 489 +               page_pool_set_frag_count(page, BIAS_MAX);
 490 +               return page;
 491 +       }
 492 +
 493 +       pool->frag_users++;
 494 +       pool->frag_offset = *offset + size;
 495 +       return page;
 496 +}
 497 +EXPORT_SYMBOL(page_pool_alloc_frag);
 498 +
 499  static void page_pool_empty_ring(struct page_pool *pool)
 500  {
 501         struct page *page;
 502 @@ -515,6 +617,8 @@ void page_pool_destroy(struct page_pool
 503         if (!page_pool_put(pool))
 504                 return;
 505
 506 +       page_pool_free_frag(pool);
 507 +
 508         if (!page_pool_release(pool))
 509                 return;
 510
 511 @@ -541,3 +645,32 @@ void page_pool_update_nid(struct page_po
 512         }
 513  }
 514  EXPORT_SYMBOL(page_pool_update_nid);
 515 +
 516 +bool page_pool_return_skb_page(struct page *page)
 517 +{
 518 +       struct page_pool *pp;
 519 +
 520 +       page = compound_head(page);
 521 +
 522 +       /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
 523 +        * in order to preserve any existing bits, such as bit 0 for the
 524 +        * head page of compound page and bit 1 for pfmemalloc page, so
 525 +        * mask those bits for freeing side when doing below checking,
 526 +        * and page_is_pfmemalloc() is checked in __page_pool_put_page()
 527 +        * to avoid recycling the pfmemalloc page.
 528 +        */
 529 +       if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
 530 +               return false;
 531 +
 532 +       pp = page->pp;
 533 +
 534 +       /* Driver set this to memory recycling info. Reset it on recycle.
 535 +        * This will *not* work for NIC using a split-page memory model.
 536 +        * The page will be returned to the pool here regardless of the
 537 +        * 'flipped' fragment being in use or not.
 538 +        */
 539 +       page_pool_put_full_page(pp, page, false);
 540 +
 541 +       return true;
 542 +}
 543 +EXPORT_SYMBOL(page_pool_return_skb_page);
 544 --- a/include/linux/mm_types.h
 545 +++ b/include/linux/mm_types.h
 546 @@ -97,10 +97,25 @@ struct page {
 547                 };
 548                 struct {        /* page_pool used by netstack */
 549                         /**
 550 -                        * @dma_addr: might require a 64-bit value on
 551 -                        * 32-bit architectures.
 552 +                        * @pp_magic: magic value to avoid recycling non
 553 +                        * page_pool allocated pages.
 554                          */
 555 -                       unsigned long dma_addr[2];
 556 +                       unsigned long pp_magic;
 557 +                       struct page_pool *pp;
 558 +                       unsigned long _pp_mapping_pad;
 559 +                       unsigned long dma_addr;
 560 +                       union {
 561 +                               /**
 562 +                                * dma_addr_upper: might require a 64-bit
 563 +                                * value on 32-bit architectures.
 564 +                                */
 565 +                               unsigned long dma_addr_upper;
 566 +                               /**
 567 +                                * For frag page support, not supported in
 568 +                                * 32-bit architectures with 64-bit DMA.
 569 +                                */
 570 +                               atomic_long_t pp_frag_count;
 571 +                       };
 572                 };
 573                 struct {        /* slab, slob and slub */
 574                         union {
 575 --- a/net/core/skbuff.c
 576 +++ b/net/core/skbuff.c
 577 @@ -594,13 +594,22 @@ static void skb_clone_fraglist(struct sk
 578                 skb_get(list);
 579  }
 580
 581 +static bool skb_pp_recycle(struct sk_buff *skb, void *data)
 582 +{
 583 +       if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
 584 +               return false;
 585 +       return page_pool_return_skb_page(virt_to_page(data));
 586 +}
 587 +
 588  static void skb_free_head(struct sk_buff *skb)
 589  {
 590         unsigned char *head = skb->head;
 591
 592 -       if (skb->head_frag)
 593 +       if (skb->head_frag) {
 594 +               if (skb_pp_recycle(skb, head))
 595 +                       return;
 596                 skb_free_frag(head);
 597 -       else
 598 +       } else
 599                 kfree(head);
 600  }
 601
 602 @@ -612,16 +621,27 @@ static void skb_release_data(struct sk_b
 603         if (skb->cloned &&
 604             atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1,
 605                               &shinfo->dataref))
 606 -               return;
 607 +               goto exit;
 608
 609         for (i = 0; i < shinfo->nr_frags; i++)
 610 -               __skb_frag_unref(&shinfo->frags[i]);
 611 +               __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
 612
 613         if (shinfo->frag_list)
 614                 kfree_skb_list(shinfo->frag_list);
 615
 616         skb_zcopy_clear(skb, true);
 617         skb_free_head(skb);
 618 +exit:
 619 +       /* When we clone an SKB we copy the recycling bit. The pp_recycle
 620 +        * bit is only set on the head though, so in order to avoid races
 621 +        * while trying to recycle fragments on __skb_frag_unref() we need
 622 +        * to make one SKB responsible for triggering the recycle path.
 623 +        * So disable the recycling bit if an SKB is cloned and we have
 624 +        * additional references to the fragmented part of the SKB.
 625 +        * Eventually the last SKB will have the recycling bit set and its
 626 +        * dataref set to 0, which will trigger the recycling
 627 +        */
 628 +       skb->pp_recycle = 0;
 629  }
 630
 631  /*
 632 @@ -1003,6 +1023,7 @@ static struct sk_buff *__skb_clone(struc
 633         n->nohdr = 0;
 634         n->peeked = 0;
 635         C(pfmemalloc);
 636 +       C(pp_recycle);
 637         n->destructor = NULL;
 638         C(tail);
 639         C(end);
 640 @@ -3421,7 +3442,7 @@ int skb_shift(struct sk_buff *tgt, struc
 641                 fragto = &skb_shinfo(tgt)->frags[merge];
 642
 643                 skb_frag_size_add(fragto, skb_frag_size(fragfrom));
 644 -               __skb_frag_unref(fragfrom);
 645 +               __skb_frag_unref(fragfrom, skb->pp_recycle);
 646         }
 647
 648         /* Reposition in the original skb */
 649 @@ -5188,6 +5209,20 @@ bool skb_try_coalesce(struct sk_buff *to
 650         if (skb_cloned(to))
 651                 return false;
 652
 653 +       /* In general, avoid mixing slab allocated and page_pool allocated
 654 +        * pages within the same SKB. However when @to is not pp_recycle and
 655 +        * @from is cloned, we can transition frag pages from page_pool to
 656 +        * reference counted.
 657 +        *
 658 +        * On the other hand, don't allow coalescing two pp_recycle SKBs if
 659 +        * @from is cloned, in case the SKB is using page_pool fragment
 660 +        * references (PP_FLAG_PAGE_FRAG). Since we only take full page
 661 +        * references for cloned SKBs at the moment that would result in
 662 +        * inconsistent reference counts.
 663 +        */
 664 +       if (to->pp_recycle != (from->pp_recycle && !skb_cloned(from)))
 665 +               return false;
 666 +
 667         if (len <= skb_tailroom(to)) {
 668                 if (len)
 669                         BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
 670 --- a/include/linux/skbuff.h
 671 +++ b/include/linux/skbuff.h
 672 @@ -37,6 +37,7 @@
 673  #include <linux/in6.h>
 674  #include <linux/if_packet.h>
 675  #include <net/flow.h>
 676 +#include <net/page_pool.h>
 677  #if IS_ENABLED(CONFIG_NF_CONNTRACK)
 678  #include <linux/netfilter/nf_conntrack_common.h>
 679  #endif
 680 @@ -786,7 +787,8 @@ struct sk_buff {
 681                                 fclone:2,
 682                                 peeked:1,
 683                                 head_frag:1,
 684 -                               pfmemalloc:1;
 685 +                               pfmemalloc:1,
 686 +                               pp_recycle:1; /* page_pool recycle indicator */
 687  #ifdef CONFIG_SKB_EXTENSIONS
 688         __u8                    active_extensions;
 689  #endif
 690 @@ -3029,9 +3031,15 @@ static inline void skb_frag_ref(struct s
 691   *
 692   * Releases a reference on the paged fragment @frag.
 693   */
 694 -static inline void __skb_frag_unref(skb_frag_t *frag)
 695 +static inline void __skb_frag_unref(skb_frag_t *frag, bool recycle)
 696  {
 697 -       put_page(skb_frag_page(frag));
 698 +       struct page *page = skb_frag_page(frag);
 699 +
 700 +#ifdef CONFIG_PAGE_POOL
 701 +       if (recycle && page_pool_return_skb_page(page))
 702 +               return;
 703 +#endif
 704 +       put_page(page);
 705  }
 706
 707  /**
 708 @@ -3043,7 +3051,7 @@ static inline void __skb_frag_unref(skb_
 709   */
 710  static inline void skb_frag_unref(struct sk_buff *skb, int f)
 711  {
 712 -       __skb_frag_unref(&skb_shinfo(skb)->frags[f]);
 713 +       __skb_frag_unref(&skb_shinfo(skb)->frags[f], skb->pp_recycle);
 714  }
 715
 716  /**
 717 @@ -4642,5 +4650,12 @@ static inline u64 skb_get_kcov_handle(st
 718  #endif
 719  }
 720
 721 +#ifdef CONFIG_PAGE_POOL
 722 +static inline void skb_mark_for_recycle(struct sk_buff *skb)
 723 +{
 724 +       skb->pp_recycle = 1;
 725 +}
 726 +#endif
 727 +
 728  #endif /* __KERNEL__ */
 729  #endif /* _LINUX_SKBUFF_H */
 730 --- a/drivers/net/ethernet/marvell/sky2.c
 731 +++ b/drivers/net/ethernet/marvell/sky2.c
 732 @@ -2501,7 +2501,7 @@ static void skb_put_frags(struct sk_buff
 733
 734                 if (length == 0) {
 735                         /* don't need this page */
 736 -                       __skb_frag_unref(frag);
 737 +                       __skb_frag_unref(frag, false);
 738                         --skb_shinfo(skb)->nr_frags;
 739                 } else {
 740                         size = min(length, (unsigned) PAGE_SIZE);
 741 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
 742 +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
 743 @@ -526,7 +526,7 @@ static int mlx4_en_complete_rx_desc(stru
 744  fail:
 745         while (nr > 0) {
 746                 nr--;
 747 -               __skb_frag_unref(skb_shinfo(skb)->frags + nr);
 748 +               __skb_frag_unref(skb_shinfo(skb)->frags + nr, false);
 749         }
 750         return 0;
 751  }
 752 --- a/net/tls/tls_device.c
 753 +++ b/net/tls/tls_device.c
 754 @@ -131,7 +131,7 @@ static void destroy_record(struct tls_re
 755         int i;
 756
 757         for (i = 0; i < record->num_frags; i++)
 758 -               __skb_frag_unref(&record->frags[i]);
 759 +               __skb_frag_unref(&record->frags[i], false);
 760         kfree(record);
 761  }
 762
 763 --- a/include/linux/poison.h
 764 +++ b/include/linux/poison.h
 765 @@ -82,4 +82,7 @@
 766  /********** security/ **********/
 767  #define KEY_DESTROY            0xbd
 768
 769 +/********** net/core/page_pool.c **********/
 770 +#define PP_SIGNATURE           (0x40 + POISON_POINTER_DELTA)
 771 +
 772  #endif
 773 --- a/include/linux/mm.h
 774 +++ b/include/linux/mm.h
 775 @@ -1602,7 +1602,7 @@ static inline bool page_is_pfmemalloc(st
 776          * Page index cannot be this large so this must be
 777          * a pfmemalloc page.
 778          */
 779 -       return page->index == -1UL;
 780 +       return (uintptr_t)page->lru.next & BIT(1);
 781  }
 782
 783  /*
 784 @@ -1611,12 +1611,12 @@ static inline bool page_is_pfmemalloc(st
 785   */
 786  static inline void set_page_pfmemalloc(struct page *page)
 787  {
 788 -       page->index = -1UL;
 789 +       page->lru.next = (void *)BIT(1);
 790  }
 791
 792  static inline void clear_page_pfmemalloc(struct page *page)
 793  {
 794 -       page->index = 0;
 795 +       page->lru.next = NULL;
 796  }
 797
 798  /*