PolarDBforPostgreSQL/external/polar_vfs/polar_directio.c

368 lines
9.3 KiB
C

/*-------------------------------------------------------------------------
*
* polar_directio.c
*
* Copyright (c) 2020, Alibaba Group Holding Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* IDENTIFICATION
* external/polar_vfs/polar_directio.c
*
*-------------------------------------------------------------------------
*/
#include <unistd.h>
#include <sys/time.h>
#include "postgres.h"
#include "port.h"
#include "funcapi.h"
#include "storage/backendid.h"
#include "storage/ipc.h"
#include "storage/pg_shmem.h"
#include "storage/shmem.h"
#include "storage/lwlock.h"
#include "utils/guc.h"
/* POLAR */
#include "storage/polar_fd.h"
#include "storage/polar_directio.h"
/* POLAR END */
int polar_max_direct_io_size = POLAR_DIRECTIO_DEFAULT_IOSIZE;
char *polar_directio_buffer = NULL;
/*
* POLAR: open file with PG_O_DIRECT flag.
* O_WRONLY is conflict to PG_O_DIRECT, so it needs to be
* replaced to O_RDWR. Besides, directory can not be open
* with PG_O_DIRECT.
*/
int
polar_directio_open(const char *path, int flags, mode_t mode)
{
struct stat st;
int ret = 0;
Assert(path != NULL);
/* POLAR: Check for path not exists or points to a regular file. */
ret = stat(path, &st);
if ((ret == 0 && S_ISREG(st.st_mode)) ||
(ret != 0 && errno == ENOENT))
{
if ((flags & POLAR_ACCESS_MODE_MASK) == O_WRONLY)
flags = (flags & (~POLAR_ACCESS_MODE_MASK)) | O_RDWR;
flags |= PG_O_DIRECT;
elog(DEBUG1, "polar_directio_open file with PG_O_DIRECT: %s", path);
}
else if (ret != 0)
return ret;
return open(path, flags, mode);
}
ssize_t
polar_directio_write(int fd, const void *buf, size_t len)
{
ssize_t res = -1;
off_t offset = lseek(fd, (off_t)0, SEEK_CUR);
if (offset < 0)
return res;
if (POLAR_DIECRTIO_IS_ALIGNED(buf) &&
POLAR_DIECRTIO_IS_ALIGNED(len) &&
POLAR_DIECRTIO_IS_ALIGNED(offset))
return write(fd, buf, len);
res = polar_directio_pwrite(fd, buf, len, offset);
if (res > 0 &&
lseek(fd, (off_t)(offset + res), SEEK_SET) < 0)
res = -1;
return res;
}
ssize_t
polar_directio_read(int fd, void *buf, size_t len)
{
ssize_t res = -1;
off_t offset = lseek(fd, (off_t)0, SEEK_CUR);
if (offset < 0)
return res;
if (POLAR_DIECRTIO_IS_ALIGNED(buf) &&
POLAR_DIECRTIO_IS_ALIGNED(len) &&
POLAR_DIECRTIO_IS_ALIGNED(offset))
return read(fd, buf, len);
res = polar_directio_pread(fd, buf, len, offset);
if (res > 0 &&
lseek(fd, (off_t)(offset + res), SEEK_SET) < 0)
res = -1;
return res;
}
/*
* POLAR: It do pread work with PG_O_DIRECT flag.
* First, malloc one new buffer which be aligned address.
* Second, pread content of file for aligned length from aligned offset.
* Third, copy content of file from new buffer to old buffer.
*
* We split the [offset, offset + len] into three sections: the first section,
* the middle sections and the last section. The boundary of content to be read
* will be processing during in the first and last section.
*/
ssize_t
polar_directio_pread(int fd, void *buffer, size_t len, off_t offset)
{
char *buf = polar_directio_buffer;
ssize_t res = -1;
char *from;
off_t head_start;
off_t head_end;
off_t tail_start;
off_t tail_end;
ssize_t count;
off_t off;
off_t nleft;
ssize_t cplen;
if (POLAR_DIECRTIO_IS_ALIGNED(buffer) &&
POLAR_DIECRTIO_IS_ALIGNED(len) &&
POLAR_DIECRTIO_IS_ALIGNED(offset))
return pread(fd, buffer, len, offset);
from = (char *) buffer;
head_start = POLAR_DIRECTIO_ALIGN_DOWN(offset);
head_end = POLAR_DIRECTIO_ALIGN(offset);
tail_start = POLAR_DIRECTIO_ALIGN_DOWN(offset + len);
tail_end = POLAR_DIRECTIO_ALIGN(offset + len);
count = 0;
nleft = len;
if (buf == NULL)
return res;
/* read from the first section */
if (head_start < head_end &&
nleft > 0)
{
off = head_start;
res = pread(fd, buf, POLAR_DIRECTIO_ALIGN_LEN, off);
if (res < 0)
return res;
else if (res <= (offset & (POLAR_DIRECTIO_ALIGN_LEN - 1)))
return count;
else
{
cplen = Min(res - (offset & (POLAR_DIRECTIO_ALIGN_LEN - 1)), len);
cplen = Min(nleft, cplen);
}
memcpy(from, buf + (offset & (POLAR_DIRECTIO_ALIGN_LEN - 1)), cplen);
from += cplen;
count += cplen;
nleft -= cplen;
}
/* read from the middle sections */
if (head_end < tail_start &&
nleft > 0)
{
off = head_end;
while (off < tail_start)
{
cplen = Min(tail_start - off, polar_max_direct_io_size);
res = pread(fd, buf, cplen, off);
if (res < 0)
return res;
res = Min(nleft, res);
memcpy(from, buf, res);
from += res;
count += res;
nleft -= res;
if (res < cplen)
break;
off += res;
}
}
/* read from the last section */
if (head_end <= tail_start &&
tail_start < tail_end &&
nleft > 0)
{
off = tail_start;
res = pread(fd, buf, POLAR_DIRECTIO_ALIGN_LEN, off);
if (res < 0)
return res;
else
{
cplen = Min(res, ((offset + len) & (POLAR_DIRECTIO_ALIGN_LEN - 1)));
cplen = Min(nleft, cplen);
}
memcpy(from, buf, cplen);
from += cplen;
count += cplen;
nleft -= cplen;
}
return count;
}
/*
* POLAR: It do pwrite work with PG_O_DIRECT flag.
* First, malloc one new buffer which be aligned address.
* Second, pread missing content for aligned length from aligned offset.
* Third, copy content of old buffer into new buffer and pwrite new
* buffer into file.
*
* We split the [offset, offset + len] into three sections: the first section,
* the middle sections and the last section. The boundary of content to be write
* will be processing during in the first and last section.
*
* The pwrite with PG_O_DIRECT will extend file size to mutiples of 4096 which
* means the final file's size could be bigger than expected! But we don't
* want that! Because some functions will use file's size in other way, such as
* twophase transaction's function ReadTwoPhaseFile. So, we truncate file to
* expected size in this case.
*/
ssize_t
polar_directio_pwrite(int fd, const void *buffer, size_t len, off_t offset)
{
#define POLAR_DIRECTIO_PWRITE_SECTION(start, len) \
do \
{ \
MemSet(buf, 0x0, POLAR_DIRECTIO_ALIGN_LEN); \
res = pread(fd, buf, POLAR_DIRECTIO_ALIGN_LEN, off); \
if (res < 0) \
return res; \
memcpy(buf + start, from, len); \
res = pwrite(fd, buf, POLAR_DIRECTIO_ALIGN_LEN, off); \
if (res < 0) \
return res; \
Assert(res == POLAR_DIRECTIO_ALIGN_LEN); \
from += len; \
count += len; \
nleft -= len; \
} while(0)
char *buf = polar_directio_buffer;
ssize_t res = -1;
char *from;
off_t head_start;
off_t head_end;
off_t tail_start;
off_t tail_end;
ssize_t count;
off_t off;
off_t nleft;
ssize_t cplen;
bool need_truncate = false;
struct stat stat_buf;
if (POLAR_DIECRTIO_IS_ALIGNED(buffer) &&
POLAR_DIECRTIO_IS_ALIGNED(len) &&
POLAR_DIECRTIO_IS_ALIGNED(offset))
return pwrite(fd, buffer, len, offset);
from = (char *) buffer;
head_start = POLAR_DIRECTIO_ALIGN_DOWN(offset);
head_end = POLAR_DIRECTIO_ALIGN(offset);
tail_start = POLAR_DIRECTIO_ALIGN_DOWN(offset + len);
tail_end = POLAR_DIRECTIO_ALIGN(offset + len);
count = 0;
nleft = len;
if (buf == NULL)
return res;
/*
* POLAR: Whether we should truncate file to expected size or not.
* stat_buf constains the original file's states including size.
*/
if (!POLAR_DIECRTIO_IS_ALIGNED(offset + len))
{
res = fstat(fd, &stat_buf);
if (res < 0)
return res;
if (stat_buf.st_size < tail_end)
need_truncate = true;
}
/* write the first section */
if (head_start < head_end &&
nleft > 0)
{
off = head_start;
cplen = Min(nleft, POLAR_DIRECTIO_ALIGN_LEN - (offset & (POLAR_DIRECTIO_ALIGN_LEN - 1)));
POLAR_DIRECTIO_PWRITE_SECTION((offset & (POLAR_DIRECTIO_ALIGN_LEN - 1)), cplen);
}
/* write the middle sections */
if (head_end < tail_start &&
nleft > 0)
{
off = head_end;
while (off < tail_start)
{
cplen = Min(tail_start - off, polar_max_direct_io_size);
memcpy(buf, from, cplen);
res = pwrite(fd, buf, cplen, off);
if (res < 0)
return res;
Assert(res == cplen);
from += res;
count += res;
nleft -= res;
off += res;
}
}
/* write the last section */
if (head_end <= tail_start &&
tail_start < tail_end &&
nleft > 0)
{
off = tail_start;
cplen = Min(nleft, (offset + len) & (POLAR_DIRECTIO_ALIGN_LEN - 1));
POLAR_DIRECTIO_PWRITE_SECTION(0, cplen);
}
Assert(nleft == 0);
/* If we should truncate file to true size */
if (need_truncate)
{
res = ftruncate(fd, Max(offset + len, stat_buf.st_size));
if (res < 0)
return res;
}
return count;
}