Two commits from upstream:

constants: remove init/destroyROM functions
threading: use 32bit atomic integer operations exclusively

Tested on amd64/i386/sparc64; now builds on powerpc.
This commit is contained in:
brad 2014-12-08 08:25:05 +00:00
parent 8219a6bc2e
commit 32015a4e81
14 changed files with 494 additions and 5 deletions

View File

@ -1,13 +1,10 @@
# $OpenBSD: Makefile,v 1.5 2014/11/02 11:36:48 brad Exp $
# $OpenBSD: Makefile,v 1.6 2014/12/08 08:25:05 brad Exp $
COMMENT= free H.265/HEVC encoder
BROKEN-hppa = undefined reference to `__sync_val_compare_and_swap_8'
BROKEN-mips64 = undefined reference to `__sync_val_compare_and_swap_8'
BROKEN-mips64el = undefined reference to `__sync_val_compare_and_swap_8'
BROKEN-powerpc = undefined reference to `__sync_val_compare_and_swap_8'
VER= 1.4
DISTNAME= x265-${VER}
REVISION= 0
DISTFILES= ${DISTNAME}{${VER}}${EXTRACT_SUFX}
CATEGORIES= multimedia
MASTER_SITES= https://bitbucket.org/multicoreware/x265/get/

View File

@ -0,0 +1,29 @@
$OpenBSD: patch-source_common_constants_cpp,v 1.1 2014/12/08 08:25:05 brad Exp $
constants: remove init/destroyROM functions
--- source/common/constants.cpp.orig Sun Dec 7 00:35:41 2014
+++ source/common/constants.cpp Sun Dec 7 00:41:06 2014
@@ -27,22 +27,6 @@
namespace x265 {
-static int initialized /* = 0 */;
-
-// initialize ROM variables
-void initROM()
-{
- if (ATOMIC_CAS32(&initialized, 0, 1) == 1)
- return;
-}
-
-void destroyROM()
-{
- if (ATOMIC_CAS32(&initialized, 1, 0) == 0)
- return;
-}
-
-
// lambda = pow(2, (double)q / 6 - 2);
double x265_lambda_tab[QP_MAX_MAX + 1] =
{

View File

@ -0,0 +1,16 @@
$OpenBSD: patch-source_common_constants_h,v 1.1 2014/12/08 08:25:05 brad Exp $
constants: remove init/destroyROM functions
--- source/common/constants.h.orig Sun Dec 7 00:40:08 2014
+++ source/common/constants.h Sun Dec 7 00:40:23 2014
@@ -29,9 +29,6 @@
namespace x265 {
// private namespace
-void initROM();
-void destroyROM();
-
void initZscanToRaster(uint32_t maxFullDepth, uint32_t depth, uint32_t startVal, uint32_t*& curIdx);
void initRasterToZscan(uint32_t maxFullDepth);

View File

@ -0,0 +1,15 @@
$OpenBSD: patch-source_common_param_cpp,v 1.1 2014/12/08 08:25:05 brad Exp $
threading: use 32bit atomic integer operations exclusively
--- source/common/param.cpp.orig Fri Oct 31 14:24:01 2014
+++ source/common/param.cpp Sun Dec 7 00:05:35 2014
@@ -1061,7 +1061,7 @@ int x265_set_globals(x265_param *param)
{
static int once /* = 0 */;
- if (ATOMIC_CAS32(&once, 0, 1) == 1)
+ if (ATOMIC_INC(&once) > 1)
{
if (param->maxCUSize != g_maxCUSize)
{

View File

@ -0,0 +1,15 @@
$OpenBSD: patch-source_common_primitives_cpp,v 1.1 2014/12/08 08:25:05 brad Exp $
constants: remove init/destroyROM functions
--- source/common/primitives.cpp.orig Sun Dec 7 00:44:34 2014
+++ source/common/primitives.cpp Sun Dec 7 00:44:44 2014
@@ -132,8 +132,6 @@ void x265_setup_primitives(x265_param *param, int cpui
#endif
Setup_Alias_Primitives(primitives);
-
- initROM();
}
if (param->logLevel >= X265_LOG_INFO)

View File

@ -0,0 +1,24 @@
$OpenBSD: patch-source_common_quant_cpp,v 1.1 2014/12/08 08:25:05 brad Exp $
threading: use 32bit atomic integer operations exclusively
--- source/common/quant.cpp.orig Fri Oct 31 14:24:01 2014
+++ source/common/quant.cpp Sun Dec 7 00:05:35 2014
@@ -81,7 +81,7 @@ inline int getICRate(uint32_t absLevel, int32_t diffLe
// NOTE: mapping to x86 hardware instruction BSR
unsigned long size;
- CLZ32(size, absLevel);
+ CLZ(size, absLevel);
int egs = size * 2 + 1;
rate += egs << 15;
@@ -135,7 +135,7 @@ inline uint32_t getICRateCost(uint32_t absLevel, int32
if (symbol)
{
unsigned long idx;
- CLZ32(idx, symbol + 1);
+ CLZ(idx, symbol + 1);
length = idx;
}

View File

@ -0,0 +1,43 @@
$OpenBSD: patch-source_common_threading_h,v 1.1 2014/12/08 08:25:05 brad Exp $
threading: use 32bit atomic integer operations exclusively
--- source/common/threading.h.orig Fri Oct 31 14:24:01 2014
+++ source/common/threading.h Sun Dec 7 00:05:35 2014
@@ -49,11 +49,10 @@
#include <sys/time.h>
#include <unistd.h>
-#define CLZ32(id, x) id = (unsigned long)__builtin_clz(x) ^ 31
-#define CTZ64(id, x) id = (unsigned long)__builtin_ctzll(x)
-#define ATOMIC_OR(ptr, mask) __sync_or_and_fetch(ptr, mask)
-#define ATOMIC_CAS(ptr, oldval, newval) __sync_val_compare_and_swap(ptr, oldval, newval)
-#define ATOMIC_CAS32(ptr, oldval, newval) __sync_val_compare_and_swap(ptr, oldval, newval)
+#define CLZ(id, x) id = (unsigned long)__builtin_clz(x) ^ 31
+#define CTZ(id, x) id = (unsigned long)__builtin_ctz(x)
+#define ATOMIC_OR(ptr, mask) __sync_fetch_and_or(ptr, mask)
+#define ATOMIC_AND(ptr, mask) __sync_fetch_and_and(ptr, mask)
#define ATOMIC_INC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, 1)
#define ATOMIC_DEC(ptr) __sync_add_and_fetch((volatile int32_t*)ptr, -1)
#define GIVE_UP_TIME() usleep(0)
@@ -99,16 +98,12 @@ inline int _BitScanForward64(DWORD *id, uint64_t x64)
#endif // if !_WIN64
-#ifndef ATOMIC_OR
-#define ATOMIC_OR(ptr, mask) InterlockedOr64((volatile LONG64*)ptr, mask)
-#endif
-
-#define CLZ32(id, x) _BitScanReverse(&id, x)
-#define CTZ64(id, x) _BitScanForward64(&id, x)
-#define ATOMIC_CAS(ptr, oldval, newval) (uint64_t)_InterlockedCompareExchange64((volatile LONG64*)ptr, newval, oldval)
-#define ATOMIC_CAS32(ptr, oldval, newval) (uint64_t)_InterlockedCompareExchange((volatile LONG*)ptr, newval, oldval)
+#define CLZ(id, x) _BitScanReverse(&id, x)
+#define CTZ(id, x) _BitScanForward(&id, x)
#define ATOMIC_INC(ptr) InterlockedIncrement((volatile LONG*)ptr)
#define ATOMIC_DEC(ptr) InterlockedDecrement((volatile LONG*)ptr)
+#define ATOMIC_OR(ptr, mask) InterlockedOr((volatile LONG*)ptr, (LONG)mask)
+#define ATOMIC_AND(ptr, mask) InterlockedAnd((volatile LONG*)ptr, (LONG)mask)
#define GIVE_UP_TIME() Sleep(0)
#endif // ifdef __GNUC__

View File

@ -0,0 +1,104 @@
$OpenBSD: patch-source_common_threadpool_cpp,v 1.1 2014/12/08 08:25:05 brad Exp $
threading: use 32bit atomic integer operations exclusively
--- source/common/threadpool.cpp.orig Fri Oct 31 14:24:01 2014
+++ source/common/threadpool.cpp Sun Dec 7 00:05:35 2014
@@ -87,7 +87,7 @@ class ThreadPoolImpl : public ThreadPool (private)
int m_numThreads;
int m_numSleepMapWords;
PoolThread *m_threads;
- volatile uint64_t *m_sleepMap;
+ volatile uint32_t *m_sleepMap;
/* Lock for write access to the provider lists. Threads are
* always allowed to read m_firstProvider and follow the
@@ -174,8 +174,8 @@ void PoolThread::threadMain()
void ThreadPoolImpl::markThreadAsleep(int id)
{
- int word = id >> 6;
- uint64_t bit = 1LL << (id & 63);
+ int word = id >> 5;
+ uint32_t bit = 1 << (id & 31);
ATOMIC_OR(&m_sleepMap[word], bit);
}
@@ -186,16 +186,16 @@ void ThreadPoolImpl::pokeIdleThread()
* not give up until a thread is awakened or all of them are awake */
for (int i = 0; i < m_numSleepMapWords; i++)
{
- uint64_t oldval = m_sleepMap[i];
+ uint32_t oldval = m_sleepMap[i];
while (oldval)
{
unsigned long id;
- CTZ64(id, oldval);
+ CTZ(id, oldval);
- uint64_t newval = oldval & ~(1LL << id);
- if (ATOMIC_CAS(&m_sleepMap[i], oldval, newval) == oldval)
+ uint32_t bit = 1 << id;
+ if (ATOMIC_AND(&m_sleepMap[i], ~bit) & bit)
{
- m_threads[(i << 6) | id].poke();
+ m_threads[i * 32 + id].poke();
return;
}
@@ -249,8 +249,8 @@ ThreadPoolImpl::ThreadPoolImpl(int numThreads)
, m_firstProvider(NULL)
, m_lastProvider(NULL)
{
- m_numSleepMapWords = (numThreads + 63) >> 6;
- m_sleepMap = X265_MALLOC(uint64_t, m_numSleepMapWords);
+ m_numSleepMapWords = (numThreads + 31) >> 5;
+ m_sleepMap = X265_MALLOC(uint32_t, m_numSleepMapWords);
char *buffer = (char*)X265_MALLOC(PoolThread, numThreads);
m_threads = reinterpret_cast<PoolThread*>(buffer);
@@ -259,9 +259,7 @@ ThreadPoolImpl::ThreadPoolImpl(int numThreads)
if (m_threads && m_sleepMap)
{
for (int i = 0; i < m_numSleepMapWords; i++)
- {
m_sleepMap[i] = 0;
- }
m_ok = true;
int i;
@@ -277,9 +275,7 @@ ThreadPoolImpl::ThreadPoolImpl(int numThreads)
}
if (m_ok)
- {
waitForAllIdle();
- }
else
{
// stop threads that did start up
@@ -300,12 +296,10 @@ void ThreadPoolImpl::waitForAllIdle()
int id = 0;
do
{
- int word = id >> 6;
- uint64_t bit = 1LL << (id & 63);
+ int word = id >> 5;
+ uint32_t bit = 1 << (id & 31);
if (m_sleepMap[word] & bit)
- {
id++;
- }
else
{
GIVE_UP_TIME();
@@ -338,9 +332,7 @@ ThreadPoolImpl::~ThreadPoolImpl()
{
// cleanup thread handles
for (int i = 0; i < m_numThreads; i++)
- {
m_threads[i].~PoolThread();
- }
X265_FREE(reinterpret_cast<char*>(m_threads));
}

View File

@ -0,0 +1,125 @@
$OpenBSD: patch-source_common_wavefront_cpp,v 1.1 2014/12/08 08:25:05 brad Exp $
threading: use 32bit atomic integer operations exclusively
--- source/common/wavefront.cpp.orig Fri Oct 31 14:24:01 2014
+++ source/common/wavefront.cpp Sun Dec 7 00:05:35 2014
@@ -33,14 +33,14 @@ bool WaveFront::init(int numRows)
{
m_numRows = numRows;
- m_numWords = (numRows + 63) >> 6;
- m_internalDependencyBitmap = X265_MALLOC(uint64_t, m_numWords);
+ m_numWords = (numRows + 31) >> 5;
+ m_internalDependencyBitmap = X265_MALLOC(uint32_t, m_numWords);
if (m_internalDependencyBitmap)
- memset((void*)m_internalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords);
+ memset((void*)m_internalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
- m_externalDependencyBitmap = X265_MALLOC(uint64_t, m_numWords);
+ m_externalDependencyBitmap = X265_MALLOC(uint32_t, m_numWords);
if (m_externalDependencyBitmap)
- memset((void*)m_externalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords);
+ memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
return m_internalDependencyBitmap && m_externalDependencyBitmap;
}
@@ -53,58 +53,31 @@ WaveFront::~WaveFront()
void WaveFront::clearEnabledRowMask()
{
- memset((void*)m_externalDependencyBitmap, 0, sizeof(uint64_t) * m_numWords);
+ memset((void*)m_externalDependencyBitmap, 0, sizeof(uint32_t) * m_numWords);
}
void WaveFront::enqueueRow(int row)
{
- // thread safe
- uint64_t bit = 1LL << (row & 63);
-
- X265_CHECK(row < m_numRows, "invalid row\n");
- ATOMIC_OR(&m_internalDependencyBitmap[row >> 6], bit);
+ uint32_t bit = 1 << (row & 31);
+ ATOMIC_OR(&m_internalDependencyBitmap[row >> 5], bit);
if (m_pool) m_pool->pokeIdleThread();
}
void WaveFront::enableRow(int row)
{
- // thread safe
- uint64_t bit = 1LL << (row & 63);
-
- X265_CHECK(row < m_numRows, "invalid row\n");
- ATOMIC_OR(&m_externalDependencyBitmap[row >> 6], bit);
+ uint32_t bit = 1 << (row & 31);
+ ATOMIC_OR(&m_externalDependencyBitmap[row >> 5], bit);
}
void WaveFront::enableAllRows()
{
- memset((void*)m_externalDependencyBitmap, ~0, sizeof(uint64_t) * m_numWords);
+ memset((void*)m_externalDependencyBitmap, ~0, sizeof(uint32_t) * m_numWords);
}
-bool WaveFront::checkHigherPriorityRow(int curRow)
-{
- int fullwords = curRow >> 6;
- uint64_t mask = (1LL << (curRow & 63)) - 1;
-
- // Check full bitmap words before curRow
- for (int i = 0; i < fullwords; i++)
- {
- if (m_internalDependencyBitmap[i] & m_externalDependencyBitmap[i])
- return true;
- }
-
- // check the partially masked bitmap word of curRow
- if (m_internalDependencyBitmap[fullwords] & m_externalDependencyBitmap[fullwords] & mask)
- return true;
- return false;
-}
-
bool WaveFront::dequeueRow(int row)
{
- uint64_t oldval, newval;
-
- oldval = m_internalDependencyBitmap[row >> 6];
- newval = oldval & ~(1LL << (row & 63));
- return ATOMIC_CAS(&m_internalDependencyBitmap[row >> 6], oldval, newval) == oldval;
+ uint32_t bit = 1 << (row & 31);
+ return ATOMIC_AND(&m_internalDependencyBitmap[row >> 5], ~bit) & bit;
}
bool WaveFront::findJob(int threadId)
@@ -114,22 +87,21 @@ bool WaveFront::findJob(int threadId)
// thread safe
for (int w = 0; w < m_numWords; w++)
{
- uint64_t oldval = m_internalDependencyBitmap[w];
- while (oldval & m_externalDependencyBitmap[w])
+ uint32_t oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w];
+ while (oldval)
{
- uint64_t mask = oldval & m_externalDependencyBitmap[w];
+ CTZ(id, oldval);
- CTZ64(id, mask);
-
- uint64_t newval = oldval & ~(1LL << id);
- if (ATOMIC_CAS(&m_internalDependencyBitmap[w], oldval, newval) == oldval)
+ uint32_t bit = 1 << id;
+ if (ATOMIC_AND(&m_internalDependencyBitmap[w], ~bit) & bit)
{
- // we cleared the bit, process row
- processRow(w * 64 + id, threadId);
+ /* we cleared the bit, we get to process the row */
+ processRow(w * 32 + id, threadId);
return true;
}
+
// some other thread cleared the bit, try another bit
- oldval = m_internalDependencyBitmap[w];
+ oldval = m_internalDependencyBitmap[w] & m_externalDependencyBitmap[w];
}
}

View File

@ -0,0 +1,28 @@
$OpenBSD: patch-source_common_wavefront_h,v 1.1 2014/12/08 08:25:05 brad Exp $
threading: use 32bit atomic integer operations exclusively
--- source/common/wavefront.h.orig Fri Oct 31 14:24:01 2014
+++ source/common/wavefront.h Sun Dec 7 00:05:35 2014
@@ -43,8 +43,8 @@ class WaveFront : public JobProvider (private)
// Dependencies are categorized as internal and external. Internal dependencies
// are caused by neighbor block availability. External dependencies are generally
// reference frame reconstructed pixels being available.
- uint64_t volatile *m_internalDependencyBitmap;
- uint64_t volatile *m_externalDependencyBitmap;
+ uint32_t volatile *m_internalDependencyBitmap;
+ uint32_t volatile *m_externalDependencyBitmap;
// number of words in the bitmap
int m_numWords;
@@ -92,10 +92,6 @@ class WaveFront : public JobProvider (private)
// Start or resume encode processing of this row, must be implemented by
// derived classes.
virtual void processRow(int row, int threadId) = 0;
-
- // Returns true if a row above curRow is available for processing. The processRow()
- // method may call this function periodically and voluntarily exit
- bool checkHigherPriorityRow(int curRow);
};
} // end namespace x265

View File

@ -0,0 +1,37 @@
$OpenBSD: patch-source_common_winxp_h,v 1.1 2014/12/08 08:25:05 brad Exp $
threading: use 32bit atomic integer operations exclusively
--- source/common/winxp.h.orig Fri Oct 31 14:24:01 2014
+++ source/common/winxp.h Sun Dec 7 00:05:35 2014
@@ -56,30 +56,6 @@ void cond_destroy(ConditionVariable *cond);
#define WakeAllConditionVariable x265::cond_broadcast
#define XP_CONDITION_VAR_FREE x265::cond_destroy
-#if defined(_MSC_VER)
-
-/* Windows XP did not define atomic OR 64, but gcc has a good version, so
- * only use this workaround when targeting XP with MSVC */
-FORCEINLINE LONGLONG interlocked_OR64(__inout LONGLONG volatile *Destination,
- __in LONGLONG Value)
-{
- LONGLONG Old;
-
- do
- {
- Old = *Destination;
- }
- while (_InterlockedCompareExchange64(Destination, Old | Value, Old) != Old);
-
- return Old;
-}
-
-#define ATOMIC_OR(ptr, mask) x265::interlocked_OR64((volatile LONG64*)ptr, mask)
-
-#if defined(_MSC_VER) && !defined(__INTEL_COMPILER)
-#pragma intrinsic(_InterlockedCompareExchange64)
-#endif
-#endif // defined(_MSC_VER)
} // namespace x265
#else // if defined(_WIN32) && (_WIN32_WINNT < 0x0600)

View File

@ -0,0 +1,14 @@
$OpenBSD: patch-source_encoder_api_cpp,v 1.1 2014/12/08 08:25:05 brad Exp $
constants: remove init/destroyROM functions
--- source/encoder/api.cpp.orig Sun Dec 7 00:43:39 2014
+++ source/encoder/api.cpp Sun Dec 7 00:43:58 2014
@@ -178,7 +178,6 @@ void x265_encoder_close(x265_encoder *enc)
extern "C"
void x265_cleanup(void)
{
- destroyROM();
BitCost::destroy();
}

View File

@ -0,0 +1,27 @@
$OpenBSD: patch-source_encoder_entropy_cpp,v 1.1 2014/12/08 08:25:05 brad Exp $
threading: use 32bit atomic integer operations exclusively
--- source/encoder/entropy.cpp.orig Fri Oct 31 14:24:01 2014
+++ source/encoder/entropy.cpp Sun Dec 7 00:05:35 2014
@@ -1116,7 +1116,7 @@ void Entropy::writeCoefRemainExGolomb(uint32_t codeNum
if (codeNumber != 0)
{
unsigned long idx;
- CLZ32(idx, codeNumber + 1);
+ CLZ(idx, codeNumber + 1);
length = idx;
codeNumber -= (1 << idx) - 1;
}
@@ -2006,9 +2006,9 @@ void Entropy::encodeBin(uint32_t binValue, uint8_t &ct
if ((binValue ^ mstate) & 1)
{
// NOTE: lps is non-zero and the maximum of idx is 8 because lps less than 256
- //numBits = g_renormTable[lps >> 3];
+ //numBits = g_renormTable[lps >> 3];
unsigned long idx;
- CLZ32(idx, lps);
+ CLZ(idx, lps);
X265_CHECK(state != 63 || idx == 1, "state failure\n");
numBits = 8 - idx;

View File

@ -0,0 +1,15 @@
$OpenBSD: patch-source_encoder_slicetype_cpp,v 1.1 2014/12/08 08:25:05 brad Exp $
threading: use 32bit atomic integer operations exclusively
--- source/encoder/slicetype.cpp.orig Fri Oct 31 14:24:01 2014
+++ source/encoder/slicetype.cpp Sun Dec 7 00:05:35 2014
@@ -192,7 +192,7 @@ Frame* Lookahead::getDecidedPicture()
/* Called by pool worker threads */
bool Lookahead::findJob(int)
{
- if (m_bReady && ATOMIC_CAS32(&m_bReady, 1, 0) == 1)
+ if (m_bReady > 0 && ATOMIC_DEC(&m_bReady) == 0)
{
m_inputQueueLock.acquire();
slicetypeDecide();