Count ones in binary representation

The task is to find ones in the binary representation of a number in given range. For an example: 1 is represented a 1 in binary so the number is 1’s in 1 is 1. Another example: 4 is represented as 100 in binary so number of 1’s in binary representation of 4 is 1.

In order to find 1’s in a very large range between 1 and 10,000,000 sounds complicated but can solved with following algorithm

#include <iostream>
#include <algorithm>
#include <cmath>

long long ccb(int col, int n) {
  double division = (n + 1) / double(2 << col);
  int sum = static_cast<int>(division);
  long long rest = std::max(0.0, (division - sum) * (2 << col) - (1 << col));
  return sum * (1 << col) + rest;

long long cab(int n) {
  long long bits = 0;
  for (int i = 0; i < 30; i++) {
  bits += ccb(i, n);
 return bits;

long long countOnes(int left, int right) {
 return cab(right) - cab(left - 1);

Multiply large numbers as strings

This algorithm is very much like factorial. Multiplication of very large numbers represented as strings because they go out of the integer range.

#include <iostream>
#include <vector>
#include <string>
#include <sstream>

std::string multiply(std::string a, std::string b) {
  // take a vector of length 200 and initialize all of its elements to 0.
  std::vector<int> multiplication(200, 0); 

 int place = 0;
 for(int i = a.size() - 1 ; i >= 0; i--){
   int carry = 0;
   std::vector<int>::iterator it = multiplication.begin() + place;
   for(int j = b.size() - 1 ; j >= 0; j--){
     int a_element = a[i] - '0';
     int b_element = b[j] - '0';
     int prod = a_element*b_element + carry + *it;
     *it = prod % 10;
     carry = prod / 10;
     while (carry){
       int current_carry = carry + *it;
       *it += current_carry % 10;
       carry = current_carry / 10;

std::stringstream ss;
for(int i = multiplication.size(); i >= 0; i--){
 ss << multiplication[i];
 return ss.str();

Reverse a number

Algorithm for reversing the number is as follows:

#include <cmath> // for finding out the length of number
// find the length of the number to find out the highest power of 10
// then divide the number by 10 and take remainder to find digit 
// at the unit's place.
// multiply the digit with power of 10. repeat length times.
int reverse(int x) {
  int length = (int)log10(abs(x)); // magic formula to find length
  int reversed = 0;
  int remainder = 0;
  for(int i = length; i > 0 ; --i){
    remainder = x % 10;
    x = x / 10;
    reversed += remainder * pow(10, i);
  reversed += x;
  return reversed;


Factorial of a large number

Factorial of an integer is found by multiplying all the numbers starting from 1 up to the number. So factorial for 5 is 5! = 5 x 4 x 3 x 2 x 1 = 120.

A recursive algorithm for finding factorial is given as

int factorial(int num){
  return num > 1 ? : num * factorial(num - 1) : 1;

The issue is factorial quickly grows out of the range of int/double/float limit.


factorial of 100 93326215443944152681699238856266700490715968264381621468592963895217599993229915608941463976156518286253697920827223758251185210916864000000000000000000000000

factorial of 1000 402387260077093773543702433923003985719374864210714632543799910429938512398629020592044208486969404800479988610197196058631666872994808558901323829669944590997424504087073759918823627727188732519779505950995276120874975462497043601418278094646496291056393887437886487337119181045825783647849977012476632889835955735432513185323958463075557409114262417474349347553428646576611667797396668820291207379143853719588249808126867838374559731746136085379534524221586593201928090878297308431392844403281231558611036976801357304216168747609675871348312025478589320767169132448426236131412508780208000261683151027341827977704784635868170164365024153691398281264810213092761244896359928705114964975419909342221566832572080821333186116811553615836546984046708975602900950537616475847728421889679646244945160765353408198901385442487984959953319101723355556602139450399736280750137837615307127761926849034352625200015888535147331611702103968175921510907788019393178114194545257223865541461062892187960223838971476088506276862967146674697562911234082439208160153780889893964518263243671616762179168909779911903754031274622289988005195444414282012187361745992642956581746628302955570299024324153181617210465832036786906117260158783520751516284225540265170483304226143974286933061690897968482590125458327168226458066526769958652682272807075781391858178889652208164348344825993266043367660176999612831860788386150279465955131156552036093988180612138558600301435694527224206344631797460594682573103790084024432438465657245014402821885252470935190620929023136493273497565513958720559654228749774011413346962715422845862377387538230483865688976461927383814900140767310446640259899490222221765904339901886018566526485061799702356193897017860040811889729918311021171229845901641921068884387121855646124960798722908519296819372388642614839657382291123125024186649353143970137428531926649875337218940694281434118520158014123344828015051399694290153483077644569099073152433278288269864602789864321139083506217095002597389863554277196742822248757586765752344220207573630569498825087968928162753848863396909959826280956121450994871701244516461260379029309120889086942028510640182154399457156805941872748998094254742173582401063677404595741785160829230135358081840096996372524230560855903700624271243416909004153690105933983835777939410970027753472000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

Following is program I wrote to find factorials and store them as strings instead of numbers.

#include <iostream> // for cout 
#include <vector> // used to store the numbers
#include <string> 
#include <sstream>

// method to multiply each number passed with each element 
// within the vector. If there is a carry, just add it to the next
// number. 
void multiply_vector(std::vector<int> & v, int multiply_num){
  int carry = 0;
  for(size_t i = 0; i < v.size(); i++){
    int prod = v[i] * multiply_num + carry;
    v[i] = prod % 10;
   carry = prod / 10;

 while (carry)
   int current_carry = carry % 10;
   carry = carry / 10;

// instead of recursion use loop and vector to store result 
// of multiplication
std::string factorial(int num){
   std::vector<int> factorial_digits;
   for(int i = 2; i <= num; i++){
     multiply_vector(factorial_digits, i);

// received the result in vector but need to reverse it..
  std::stringstream ss;
  for(int i = factorial_digits.size()-1; i >= 0; i--){
    ss << factorial_digits[i];
  return ss.str();

int main(){
  // find factorial of 1000
  std::cout << factorial(1000) << '\n';
  return 0;


Quick Concepts Part 1 – Introduction to RDMA


This is the first post in a three post series on getting started. By the end of the getting started series my goal is to get you ready to start coding with a sample program that will demonstrate the performance benefits of RDMA. In this post I will review RDMA concepts in a simple minded way. Later I intend to come back and do more detailed posts on each concept.

What is RDMA

Image: Simple Minded View of RDMA

RDMA is Remote Dynamic Memory Access which is a way of moving buffers between two applications across a network. RDMA differs from traditional network interfaces because it bypasses the operating system. This allows programs that implement RDMA to have:

  1. The absolute lowest latency
  2. The highest throughput
  3. Smallest CPU footprint

How Can We Use It

To make use of RDMA we need to have a network interface card that implements an RDMA engine.

We call this an HCA…

View original post 912 more words

Pair in array with target sum

Given a number target and an array of numbers nums find the index of numbers which has sum equal to target.

Algorithm is:

  1. Iterate over the array
  2. subtract the first element from target, find the difference in the array
  3. Get index of current element and difference if found within the array.

c++ solution

#include <algorithm> // for std::distance and std::find
#include <vector>

std::vector<int> two_sum(std::vector<int>& nums, int target) {
  std::ptrdiff_t first_idx = 0, second_idx = 0;
  for(size_t i = 0; i <= nums.size(); i++){
    int difference = target - nums[i];
    second_idx = std::distance(nums.begin(), std::find(nums.begin()+i+1,
     nums.end(), difference));
     if(second_idx < nums.size()){
       first_idx = i;
   return {static_cast<int>(first_idx), static_cast<int>(second_idx)};

Ruby solution

def two_sums(nums, target)
  first_idx, second_idx = 0, 0
  nums.each_with_index do |num, idx|
    difference = target - num
    second_idx = nums.rindex(difference)
    first_idx = idx unless second_idx.nil?
  [first_idx, second_idx]

Atomics & lock-free data structures c++

  • The modern microprocessor pipeline is 14 stages deep during which the programming instructions reordered all the times for optimization purposes.
  • Linux 2.6 supports full preemption, i.e. the thread can be suspended in between.
  • Atomic types are the locations in main memory to which the access is exclusive to a thread/process.
  • Barriers are used for ordering the accesses to the memory locations.
  • Atomic operations are provided at hardware level in order to make the operations indivisible.
  • The Implementation is highly dependent upon the hardware. x86 has strictest rules around memory ordering.
  • Atomic operations with memory fences prevents reordering of the instructions in order to make the operation indivisible.
  • Atomic operations are expensive because the OS and hardware can not do all the necessary optimizations.

<atomic> header provides various atomic types. Following is non-exhaustive list of atomic types

atomic_bool std::atomic<bool>
atomic_char std::atomic<char>
atomic_schar std::atomic<signed char>
atomic_uchar std::atomic<unsigned char>
atomic_int std::atomic<int>
atomic_uint std::atomic<unsigned>
atomic_short std::atomic<short>
atomic_ushort std::atomic<unsigned short>
atomic_long std::atomic<long>
atomic_ulong std::atomic<unsigned long>
atomic_llong std::atomic<long long>
atomic_ullong std::atomic<unsigned long long>
atomic_char16_t std::atomic<char16_t>
atomic_char32_t std::atomic<char32_t>
atomic_wchar_t std::atomic<wchar_t>

Operations on Atomic types

These operations take a argument for memory order. Which can be one of std::memory_order_relaxed, std::memory_order_acquire,  std::memory_order_release, std::memory_order_acq_rel,  std::memory_order_consume or  std::memory_order_seq_cst

  • load: this is read operation on an atomic type.
  • store: this is a write operation on an atomic type.
  • exchange: this is read-modify-write operation on an atomic type. All the compare operations are used as compare_exchange(expected, desired, <optional memory order>). On successful exchange it return true else it returns false.
    • compare_exchange
    • compare_exchange_weak: these are really for the architectures where the read-modify-write operation is not guaranteed atomic. It can generate spurious errors and it is advised to use in a loop. It has same effect as of compare_exchange_strong on x86 platform.
    • compare_exchange_strong: it is guaranteed to return false on failure and guaranteed to return true on success.
  • fetch_ versions of add, or etc
  • overriden operators like +=, -=, *=, |=

Lock Based implementation of a Multi-producer, Multi-consumer Queue.


It is important to see the lock based data structures before implementing lock-free data structures.

Code for Lock based queue.
std::queue is wrapped in a struct Queue. 
internal_queue variable maintains the queue.
Work to do is an instance of struct Work.
Queue has pop and push functions.
Push function takes the work instance by rvalue instance and 
pushes it onto the internal queue.
Pop function returns a shared pointer to Work. The shared pointer
 is used to avoid the ABA problem.
Pop function is a non blocking function.

If the queue is empty, the thread just yields.
A sleep is added in order to simulate slowness in the work..

Disadvantages of this kind of queue:
 * Whole Queue is locked during any operations in order to make 
sure there is synchronized access.

#include <queue> // std::queue
#include <mutex> // for locking
#include <thread>
#include <memory> // smart pointers
#include <utility> 
#include <functional> // passing and storing lambda 
#include <iostream> // for std::cout
#include <chrono>

static long work_number = 0;

struct Work{
 // take lambda as a work and call lambda to process.
  Work(std::function<void(void)> lambda):f(lambda){};
  void operator()(){
  std::function<void(void)> f; 

struct Queue{
  Queue() = default;
  // Queue is not copyable or copy constructible.
  Queue(const Queue &) = delete;
  Queue & operator=(const Queue &) = delete;
  std::shared_ptr<Work> pop();
  void push(Work &&);
  std::mutex mtx;
  std::queue<Work> internal_queue;

 void Queue::push(Work && work){
  std::lock_guard<std::mutex> lg(mtx);

 std::shared_ptr<Work> Queue::pop(){
  Work w([](){});
    std::lock_guard<std::mutex> lg(mtx);
    if(internal_queue.size() > 0){
      w = std::move(internal_queue.front());
     std::this_thread::yield(); // let the other threads work
 return std::make_shared<Work>(w);

struct Producer{
 Producer(Queue & q):q(q){};
 void produce(){
     Work w([&](){std::cout << "work number : " << ++work_number << " is called.." << "\n";});
 Queue & q;

struct Consumer{
  Consumer(Queue & q):q(q){};
   void consume(){
     std::shared_ptr<Work> w_ptr = q.pop();
     Work * w = w_ptr.get();
 Queue & q;

int main(){
 Queue q;
 std::thread producer_thread1([&q](){
   Producer p(q);

std::thread producer_thread2([&q](){
  Producer p(q);

std::thread consumer_thread1([&q](){
 Consumer c(q);

std::thread consumer_thread2([&q](){
 Consumer c(q);

std::thread consumer_thread3([&q](){
 Consumer c(q);

 return 0;

Lock Free Data Structures

In the above implementation, the data structure is a lock based one and hence no two threads can access is concurrently.