/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2002
 *    Sleepycat Software.  All rights reserved.
 */
#include "db_config.h"

#ifndef lint
static const char revid[] = "$Id: mp_fget.c,v 11.68 2002/08/06 04:58:09 bostic Exp $";
#endif /* not lint */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <string.h>
#endif

#include "db_int.h"
#include "dbinc/db_shash.h"
#include "dbinc/mp.h"

#ifdef HAVE_FILESYSTEM_NOTZERO
static int __memp_fs_notzero
    __P((DB_ENV *, DB_MPOOLFILE *, MPOOLFILE *, db_pgno_t *));
#endif

/*
 * __memp_fget --
 *    Get a page from the file.
 *
 * PUBLIC: int __memp_fget
 * PUBLIC:     __P((DB_MPOOLFILE *, db_pgno_t *, u_int32_t, void *));
 */
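/*
 * Usage sketch (illustrative; not part of the original file): callers
 * normally reach this routine through the DB_MPOOLFILE method table,
 * pairing each successful get with a put once they are done with the
 * page:
 *
 *      db_pgno_t pgno;
 *      void *addr;
 *      int ret;
 *
 *      pgno = 0;
 *      if ((ret = dbmfp->get(dbmfp, &pgno, DB_MPOOL_CREATE, &addr)) != 0)
 *            return (ret);
 *      ...use the page...
 *      (void)dbmfp->put(dbmfp, addr, 0);
 *
 * The get/put method names reflect the DB 4.1 DB_MPOOLFILE interface as
 * we understand it; consult db.h for the authoritative signatures.
 */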
int
__memp_fget(dbmfp, pgnoaddr, flags, addrp)
      DB_MPOOLFILE *dbmfp;
      db_pgno_t *pgnoaddr;
      u_int32_t flags;
      void *addrp;
{
      enum { FIRST_FOUND, FIRST_MISS, SECOND_FOUND, SECOND_MISS } state;
      BH *alloc_bhp, *bhp;
      DB_ENV *dbenv;
      DB_MPOOL *dbmp;
      DB_MPOOL_HASH *hp;
      MPOOL *c_mp, *mp;
      MPOOLFILE *mfp;
      roff_t mf_offset;
      u_int32_t n_cache, st_hsearch;
      int b_incr, extending, first, ret;

      *(void **)addrp = NULL;

      dbmp = dbmfp->dbmp;
      dbenv = dbmp->dbenv;

      PANIC_CHECK(dbenv);

      mp = dbmp->reginfo[0].primary;
      mfp = dbmfp->mfp;
      mf_offset = R_OFFSET(dbmp->reginfo, mfp);
      alloc_bhp = bhp = NULL;
      hp = NULL;
      b_incr = extending = ret = 0;

      /*
       * Validate arguments.
       *
       * !!!
       * Don't test for DB_MPOOL_CREATE and DB_MPOOL_NEW flags for readonly
       * files here, and create non-existent pages in readonly files if the
       * flags are set, later.  The reason is that the hash access method
       * wants to get empty pages that don't really exist in readonly files.
       * The only alternative is for hash to write the last "bucket" all the
       * time, which we don't want to do because one of our big goals in life
       * is to keep database files small.  It's sleazy as hell, but we catch
       * any attempt to actually write the file in memp_fput().
       */
#define     OKFLAGS           (DB_MPOOL_CREATE | DB_MPOOL_LAST | DB_MPOOL_NEW)
      if (flags != 0) {
            if ((ret = __db_fchk(dbenv, "memp_fget", flags, OKFLAGS)) != 0)
                  return (ret);

            switch (flags) {
            case DB_MPOOL_CREATE:
                  break;
            case DB_MPOOL_LAST:
                  /* Get the last page number in the file. */
                  R_LOCK(dbenv, dbmp->reginfo);
                  *pgnoaddr = mfp->last_pgno;
                  R_UNLOCK(dbenv, dbmp->reginfo);
                  break;
            case DB_MPOOL_NEW:
                  /*
                   * If always creating a page, skip the first search
                   * of the hash bucket.
                   */
                  goto alloc;
            default:
                  return (__db_ferr(dbenv, "memp_fget", 1));
            }
      }

      /*
       * If mmap'ing the file and the page is not past the end of the file,
       * just return a pointer.
       *
       * The page may be past the end of the file, so check the page number
       * argument against the original length of the file.  If we previously
       * returned pages past the original end of the file, last_pgno will
       * have been updated to match the "new" end of the file, and checking
       * against it would return pointers past the end of the mmap'd region.
       *
       * If another process has opened the file for writing since we mmap'd
       * it, we will start playing the game by their rules, i.e. everything
       * goes through the cache.  All pages previously returned will be safe,
       * as long as the correct locking protocol was observed.
       *
       * We don't discard the map because we don't know when all of the
       * pages will have been discarded from the process' address space.
       * It would be possible to do so by reference counting the open
       * pages from the mmap, but it's unclear to me that it's worth it.
       */
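      /*
       * For example (illustrative numbers): with a 4KB st_pagesize,
       * page 3 lives at byte offset 3 * 4096 = 12288 from the base of
       * the mapped region, which is the address R_ADDR computes below.
       */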
      if (dbmfp->addr != NULL &&
          F_ISSET(mfp, MP_CAN_MMAP) && *pgnoaddr <= mfp->orig_last_pgno) {
            *(void **)addrp =
                R_ADDR(dbmfp, *pgnoaddr * mfp->stat.st_pagesize);
            ++mfp->stat.st_map;
            return (0);
      }

hb_search:
      /*
       * Determine the cache and hash bucket where this page lives and get
       * local pointers to them.  These are recalculated on each pass
       * through this code because the page number can change.
       */
      n_cache = NCACHE(mp, mf_offset, *pgnoaddr);
      c_mp = dbmp->reginfo[n_cache].primary;
      hp = R_ADDR(&dbmp->reginfo[n_cache], c_mp->htab);
      hp = &hp[NBUCKET(c_mp, mf_offset, *pgnoaddr)];
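      /*
       * A sketch of the idea (the real NCACHE/NBUCKET macros may differ
       * in detail): the (file, page) pair is hashed twice, once to pick
       * one of the cache regions and once to pick a bucket within that
       * cache's hash table, e.g.:
       *
       *      cache  = (mf_offset + pgno) % n_caches;
       *      bucket = (mf_offset + pgno) % n_hash_buckets;
       */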

      /* Search the hash chain for the page. */
retry:      st_hsearch = 0;
      MUTEX_LOCK(dbenv, &hp->hash_mutex);
      for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh);
          bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) {
            ++st_hsearch;
            if (bhp->pgno != *pgnoaddr || bhp->mf_offset != mf_offset)
                  continue;

            /*
             * Increment the reference count.  We may discard the hash
             * bucket lock as we evaluate and/or read the buffer, so we
             * need to ensure it doesn't move and its contents remain
             * unchanged.
             */
            if (bhp->ref == UINT16_T_MAX) {
                  __db_err(dbenv,
                      "%s: page %lu: reference count overflow",
                      __memp_fn(dbmfp), (u_long)bhp->pgno);
                  ret = EINVAL;
                  MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
                  goto err;
            }
            ++bhp->ref;
            b_incr = 1;

            /*
             * BH_LOCKED --
             * I/O is in progress or sync is waiting on the buffer to write
             * it.  Because we've incremented the buffer reference count,
             * we know the buffer can't move.  Unlock the bucket lock, wait
             * for the buffer to become available, reacquire the bucket.
             */
            for (first = 1; F_ISSET(bhp, BH_LOCKED) &&
                !F_ISSET(dbenv, DB_ENV_NOLOCKING); first = 0) {
                  /*
                   * If someone is trying to sync this buffer and the
                   * buffer is hot, they may never get in.  Give up
                   * and try again.
                   */
                  if (!first && bhp->ref_sync != 0) {
                        --bhp->ref;
                        b_incr = 0;
                        MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
                        __os_yield(dbenv, 1);
                        goto retry;
                  }

                  MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
                  /*
                   * Explicitly yield the processor if not the first pass
                   * through this loop -- if we don't, we might run to the
                   * end of our CPU quantum as we will simply be swapping
                   * between the two locks.
                   */
                  if (!first)
                        __os_yield(dbenv, 1);

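                  /*
                   * The thread doing the I/O holds bhp->mutex for the
                   * duration of the transfer, so acquiring and then
                   * immediately releasing it below is, in effect, a
                   * "wait until the I/O completes": the buffer mutex
                   * doubles as a condition variable.
                   */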
                  MUTEX_LOCK(dbenv, &bhp->mutex);
                  /* Wait for I/O to finish... */
                  MUTEX_UNLOCK(dbenv, &bhp->mutex);
                  MUTEX_LOCK(dbenv, &hp->hash_mutex);
            }

            ++mfp->stat.st_cache_hit;
            break;
      }

      /*
       * Update the hash bucket search statistics -- do now because our next
       * search may be for a different bucket.
       */
      ++c_mp->stat.st_hash_searches;
      if (st_hsearch > c_mp->stat.st_hash_longest)
            c_mp->stat.st_hash_longest = st_hsearch;
      c_mp->stat.st_hash_examined += st_hsearch;

      /*
       * There are 4 possible paths to this location:
       *
       * FIRST_MISS:
       *    Didn't find the page in the hash bucket on our first pass:
       *    bhp == NULL, alloc_bhp == NULL
       *
       * FIRST_FOUND:
       *    Found the page in the hash bucket on our first pass:
       *    bhp != NULL, alloc_bhp == NULL
       *
       * SECOND_FOUND:
       *    Didn't find the page in the hash bucket on the first pass,
       *    allocated space, and found the page in the hash bucket on
       *    our second pass:
       *    bhp != NULL, alloc_bhp != NULL
       *
       * SECOND_MISS:
       *    Didn't find the page in the hash bucket on the first pass,
       *    allocated space, and didn't find the page in the hash bucket
       *    on our second pass:
       *    bhp == NULL, alloc_bhp != NULL
       */
      state = bhp == NULL ?
          (alloc_bhp == NULL ? FIRST_MISS : SECOND_MISS) :
          (alloc_bhp == NULL ? FIRST_FOUND : SECOND_FOUND);
      switch (state) {
      case FIRST_FOUND:
            /* We found the buffer in our first check -- we're done. */
            break;
      case FIRST_MISS:
            /*
             * We didn't find the buffer in our first check.  Figure out
             * if the page exists, and allocate structures so we can add
             * the page to the buffer pool.
             */
            MUTEX_UNLOCK(dbenv, &hp->hash_mutex);

alloc:            /*
             * If DB_MPOOL_NEW is set, we have to allocate a page number.
             * If neither DB_MPOOL_CREATE nor DB_MPOOL_NEW is set, then
             * it's an error to try to get a page past the end of the file.
             */
            COMPQUIET(n_cache, 0);

            extending = ret = 0;
            R_LOCK(dbenv, dbmp->reginfo);
            switch (flags) {
            case DB_MPOOL_NEW:
                  extending = 1;
                  *pgnoaddr = mfp->last_pgno + 1;
                  break;
            case DB_MPOOL_CREATE:
                  extending = *pgnoaddr > mfp->last_pgno;
                  break;
            default:
                  ret = *pgnoaddr > mfp->last_pgno ? DB_PAGE_NOTFOUND : 0;
                  break;
            }
            R_UNLOCK(dbenv, dbmp->reginfo);
            if (ret != 0)
                  goto err;
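            /*
             * For example (illustrative): a plain get of page
             * last_pgno + 3 fails above with DB_PAGE_NOTFOUND, while
             * the same request with DB_MPOOL_CREATE sets extending and
             * will grow the file to cover the page.
             */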

            /*
             * !!!
             * In the DB_MPOOL_NEW code path, mf_offset and n_cache have
             * not yet been initialized.
             */
            mf_offset = R_OFFSET(dbmp->reginfo, mfp);
            n_cache = NCACHE(mp, mf_offset, *pgnoaddr);

            /* Allocate a new buffer header and data space. */
            if ((ret = __memp_alloc(dbmp,
                &dbmp->reginfo[n_cache], mfp, 0, NULL, &alloc_bhp)) != 0)
                  goto err;
#ifdef DIAGNOSTIC
            if ((db_alignp_t)alloc_bhp->buf & (sizeof(size_t) - 1)) {
                  __db_err(dbenv,
                      "Error: buffer data is NOT size_t aligned");
                  ret = EINVAL;
                  goto err;
            }
#endif
            /*
             * If we are extending the file, we'll need the region lock
             * again.
             */
            if (extending)
                  R_LOCK(dbenv, dbmp->reginfo);

            /*
             * DB_MPOOL_NEW does not guarantee you a page unreferenced by
             * any other thread of control.  (That guarantee is interesting
             * for DB_MPOOL_NEW, unlike DB_MPOOL_CREATE, because the caller
             * did not specify the page number, and so, may reasonably not
             * have any way to lock the page outside of mpool.) Regardless,
             * if we allocate the page, and some other thread of control
             * requests the page by number, we will not detect that and the
             * thread of control that allocated using DB_MPOOL_NEW may not
             * have a chance to initialize the page.  (Note: we *could*
             * detect this case if we set a flag in the buffer header which
             * guaranteed that no gets of the page would succeed until the
             * reference count went to 0, that is, until the creating thread
             * put the page.)  What we do guarantee is that if two threads
             * of control are both doing DB_MPOOL_NEW calls, they won't
             * collide, that is, they won't both get the same page.
             *
             * There's a possibility that another thread allocated the page
             * we were planning to allocate while we were off doing buffer
             * allocation.  We can detect that by checking whether the page
             * number we were going to use is still available.  If it's not,
             * we check to see if the next available page number hashes to
             * the same mpool region as the old one -- if it does, we can
             * continue, otherwise, we have to start over.
             */
            if (flags == DB_MPOOL_NEW && *pgnoaddr != mfp->last_pgno + 1) {
                  *pgnoaddr = mfp->last_pgno + 1;
                  if (n_cache != NCACHE(mp, mf_offset, *pgnoaddr)) {
                        __db_shalloc_free(
                            dbmp->reginfo[n_cache].addr, alloc_bhp);
                        /*
                         * flags == DB_MPOOL_NEW, so extending is set
                         * and we're holding the region locked.
                         */
                        R_UNLOCK(dbenv, dbmp->reginfo);

                        alloc_bhp = NULL;
                        goto alloc;
                  }
            }

            /*
             * We released the region lock, so another thread might have
             * extended the file.  Update the last_pgno and initialize
             * the file, as necessary, if we extended the file.
             */
            if (extending) {
#ifdef HAVE_FILESYSTEM_NOTZERO
                  if (*pgnoaddr > mfp->last_pgno &&
                      __os_fs_notzero() &&
                      F_ISSET(dbmfp->fhp, DB_FH_VALID))
                        ret = __memp_fs_notzero(
                            dbenv, dbmfp, mfp, pgnoaddr);
                  else
                        ret = 0;
#endif
                  if (ret == 0 && *pgnoaddr > mfp->last_pgno)
                        mfp->last_pgno = *pgnoaddr;

                  R_UNLOCK(dbenv, dbmp->reginfo);
                  if (ret != 0)
                        goto err;
            }
            goto hb_search;
      case SECOND_FOUND:
            /*
             * We allocated buffer space for the requested page, but then
             * found the page in the buffer cache on our second check.
             * That's OK -- we can use the page we found in the pool,
             * unless DB_MPOOL_NEW is set.
             *
             * Free the allocated memory, we no longer need it.  Since we
             * can't acquire the region lock while holding the hash bucket
             * lock, we have to release the hash bucket and re-acquire it.
             * That's OK, because we have the buffer pinned down.
             */
            MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
            R_LOCK(dbenv, &dbmp->reginfo[n_cache]);
            __db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);
            alloc_bhp = NULL;
            R_UNLOCK(dbenv, &dbmp->reginfo[n_cache]);
            MUTEX_LOCK(dbenv, &hp->hash_mutex);

            /*
             * We can't use the page we found in the pool if DB_MPOOL_NEW
             * was set.  (For details, see the above comment beginning
             * "DB_MPOOL_NEW does not guarantee you a page unreferenced by
             * any other thread of control".)  If DB_MPOOL_NEW is set, we
             * release our pin on this particular buffer, and try to get
             * another one.
             */
            if (flags == DB_MPOOL_NEW) {
                  --bhp->ref;
                  b_incr = 0;
                  goto alloc;
            }
            break;
      case SECOND_MISS:
            /*
             * We allocated buffer space for the requested page, and found
             * the page still missing on our second pass through the buffer
             * cache.  Instantiate the page.
             */
            bhp = alloc_bhp;
            alloc_bhp = NULL;

            /*
             * Initialize all the BH and hash bucket fields so we can call
             * __memp_bhfree if an error occurs.
             *
             * Append the buffer to the tail of the bucket list and update
             * the hash bucket's priority.
             */
            b_incr = 1;

            memset(bhp, 0, sizeof(BH));
            bhp->ref = 1;
            bhp->priority = UINT32_T_MAX;
            bhp->pgno = *pgnoaddr;
            bhp->mf_offset = mf_offset;
            SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
            hp->hash_priority =
                SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;

            /* If we extended the file, make sure the page is never lost. */
            if (extending) {
                  ++hp->hash_page_dirty;
                  F_SET(bhp, BH_DIRTY | BH_DIRTY_CREATE);
            }

            /*
             * If we created the page, zero it out.  If we didn't create
             * the page, read from the backing file.
             *
             * !!!
             * DB_MPOOL_NEW doesn't call the pgin function.
             *
             * If DB_MPOOL_CREATE is used, then the application's pgin
             * function has to be able to handle pages of 0's -- if it
             * uses DB_MPOOL_NEW, it can detect all of its page creates,
             * and not bother.
             *
             * If we're running in diagnostic mode, smash any bytes on the
             * page that are unknown quantities for the caller.
             *
             * Otherwise, read the page into memory, optionally creating it
             * if DB_MPOOL_CREATE is set.
             */
            if (extending) {
                  if (mfp->clear_len == 0)
                        memset(bhp->buf, 0, mfp->stat.st_pagesize);
                  else {
                        memset(bhp->buf, 0, mfp->clear_len);
#if defined(DIAGNOSTIC) || defined(UMRW)
                        memset(bhp->buf + mfp->clear_len, CLEAR_BYTE,
                            mfp->stat.st_pagesize - mfp->clear_len);
#endif
                  }

                  if (flags == DB_MPOOL_CREATE && mfp->ftype != 0)
                        F_SET(bhp, BH_CALLPGIN);

                  ++mfp->stat.st_page_create;
            } else {
                  F_SET(bhp, BH_TRASH);
                  ++mfp->stat.st_cache_miss;
            }

            /* Increment buffer count referenced by MPOOLFILE. */
            MUTEX_LOCK(dbenv, &mfp->mutex);
            ++mfp->block_cnt;
            MUTEX_UNLOCK(dbenv, &mfp->mutex);

            /*
             * Initialize the mutex.  This is the last initialization step,
       * because it's the only one that can fail, and everything else
       * must already be set up before we can safely jump to the err
       * label, which will call __memp_bhfree.
             */
            if ((ret = __db_mutex_setup(dbenv,
                &dbmp->reginfo[n_cache], &bhp->mutex, 0)) != 0)
                  goto err;
      }

      DB_ASSERT(bhp->ref != 0);

      /*
       * If we're the only reference, update buffer and bucket priorities.
       * We may be about to release the hash bucket lock, and everything
       * should be correct before we do.  (We've already done this if we
       * created the buffer, so there is no need to do it again.)
       */
      if (state != SECOND_MISS && bhp->ref == 1) {
            bhp->priority = UINT32_T_MAX;
            SH_TAILQ_REMOVE(&hp->hash_bucket, bhp, hq, __bh);
            SH_TAILQ_INSERT_TAIL(&hp->hash_bucket, bhp, hq);
            hp->hash_priority =
                SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority;
      }
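      /*
       * (Buffers in a bucket are kept ordered by priority, and the
       * bucket's hash_priority tracks its lowest-priority buffer.  The
       * cache allocation code looks for low-priority buckets when it
       * needs space, so setting this buffer's priority to UINT32_T_MAX
       * makes it the least attractive eviction candidate.)
       */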

      /*
       * BH_TRASH --
       * The buffer we found may need to be filled from the disk.
       *
       * It's possible for the read function to fail, which means we fail as
       * well.  Note, the __memp_pgread() function discards and reacquires
       * the hash lock, so the buffer must be pinned down so that it cannot
       * move and its contents remain unchanged.  Discard the buffer on failure
       * unless another thread is waiting on our I/O to complete.  It's OK to
       * leave the buffer around, as the waiting thread will see the BH_TRASH
       * flag set, and will also attempt to discard it.  If there's a waiter,
       * we need to decrement our reference count.
       */
      if (F_ISSET(bhp, BH_TRASH) &&
          (ret = __memp_pgread(dbmfp,
          &hp->hash_mutex, bhp, LF_ISSET(DB_MPOOL_CREATE) ? 1 : 0)) != 0)
            goto err;

      /*
       * BH_CALLPGIN --
       * The buffer was processed for being written to disk, and now has
       * to be re-converted for use.
       */
      if (F_ISSET(bhp, BH_CALLPGIN)) {
            if ((ret = __memp_pg(dbmfp, bhp, 1)) != 0)
                  goto err;
            F_CLR(bhp, BH_CALLPGIN);
      }

      MUTEX_UNLOCK(dbenv, &hp->hash_mutex);

#ifdef DIAGNOSTIC
      /* Update the file's pinned reference count. */
      R_LOCK(dbenv, dbmp->reginfo);
      ++dbmfp->pinref;
      R_UNLOCK(dbenv, dbmp->reginfo);

      /*
       * We want to switch threads as often as possible, and at awkward
       * times.  Yield every time we get a new page to ensure contention.
       */
      if (F_ISSET(dbenv, DB_ENV_YIELDCPU))
            __os_yield(dbenv, 1);
#endif

      *(void **)addrp = bhp->buf;
      return (0);

err:  /*
       * Discard our reference.  If we're the only reference, discard the
       * buffer entirely.  If we held a reference to a buffer, we are
       * also still holding the hash bucket mutex.
       */
      if (b_incr) {
            if (bhp->ref == 1)
                  (void)__memp_bhfree(dbmp, hp, bhp, 1);
            else {
                  --bhp->ref;
                  MUTEX_UNLOCK(dbenv, &hp->hash_mutex);
            }
      }

      /* If alloc_bhp is set, free the memory. */
      if (alloc_bhp != NULL)
            __db_shalloc_free(dbmp->reginfo[n_cache].addr, alloc_bhp);

      return (ret);
}

#ifdef HAVE_FILESYSTEM_NOTZERO
/*
 * __memp_fs_notzero --
 *    Initialize the underlying allocated pages in the file.
 */
static int
__memp_fs_notzero(dbenv, dbmfp, mfp, pgnoaddr)
      DB_ENV *dbenv;
      DB_MPOOLFILE *dbmfp;
      MPOOLFILE *mfp;
      db_pgno_t *pgnoaddr;
{
      DB_IO db_io;
      u_int32_t i, npages;
      size_t nw;
      int ret;
      u_int8_t *page;
      char *fail;

      /*
       * On some systems, pages allocated by writing past end-of-file are
       * not zeroed.  Recovery could theoretically be fooled by a page
       * showing up that contained garbage.  In order to avoid this, we
       * have to write the pages out to disk, and flush them.  The reason
       * for the flush is that if we don't sync, the allocation of another
       * page subsequent to this one might reach the disk first, and if we
       * crashed at the right moment, leave us with this page as the one
       * allocated by writing a page past it in the file.
       *
       * Hash is the only access method that allocates groups of pages.  We
       * know that it will use the existence of the last page in a group to
       * signify that the entire group is OK; so, write all the pages but
       * the last one in the group, flush them to disk, and then write the
       * last one to disk and flush it.
       */
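      /*
       * Worked example (illustrative numbers): if last_pgno is 10 and
       * *pgnoaddr is 14, npages is 4.  Pages 11, 12 and 13 are written
       * and synced first; only then is page 14 written and synced, so
       * the presence of page 14 on disk implies the whole group is
       * stable.
       */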
      if ((ret = __os_calloc(dbenv, 1, mfp->stat.st_pagesize, &page)) != 0)
            return (ret);

      db_io.fhp = dbmfp->fhp;
      db_io.mutexp = dbmfp->mutexp;
      db_io.pagesize = db_io.bytes = mfp->stat.st_pagesize;
      db_io.buf = page;

      npages = *pgnoaddr - mfp->last_pgno;
      for (i = 1; i < npages; ++i) {
            db_io.pgno = mfp->last_pgno + i;
            if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
                  fail = "write";
                  goto err;
            }
      }
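      /*
       * If the loop above wrote any intermediate pages (i != 1), sync
       * them to disk before the final page in the group is written.
       */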
      if (i != 1 && (ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) {
            fail = "sync";
            goto err;
      }

      db_io.pgno = mfp->last_pgno + npages;
      if ((ret = __os_io(dbenv, &db_io, DB_IO_WRITE, &nw)) != 0) {
            fail = "write";
            goto err;
      }
      if ((ret = __os_fsync(dbenv, dbmfp->fhp)) != 0) {
            fail = "sync";
err:        __db_err(dbenv, "%s: %s failed for page %lu",
                __memp_fn(dbmfp), fail, (u_long)db_io.pgno);
      }

      __os_free(dbenv, page);
      return (ret);
}
#endif
