Yekun's Note

Machine learning notes and writeup.

Fork me on GitHub

Tricks of Python

Useful usage of python.

Advanced Trick

Implement Trie (Prefix Tree)

  • defaultdict
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
from collections import defaultdict
from functools import reduce
TrieNode = lambda: defaultdict(TrieNode)

class Trie:
def __init__(self):
self.trie = TrieNode()

def insert(self, word):
reduce(dict.__getitem__, word, self.trie)['end'] = True

def search(self, word):
return reduce(lambda d,k: d[k] if k in d else TrieNode(), word, self.trie).get('end', False)

def startsWith(self, word):
return bool(reduce(lambda d,k: d[k] if k in d else TrieNode(), word, self.trie).keys())
  • dict
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
class TrieNode(object):
def __init__(self):
"""
Initialize your data structure here.
"""
self.data = {}
self.is_word = False

class Trie(object):
def __init__(self):
self.root = TrieNode()

def insert(self, word):
"""
Inserts a word into the trie.
:type word: str
:rtype: void
"""
node = self.root
for letter in word:
child = node.data.get(letter)
if not child:
node.data[letter] = TrieNode()
node = node.data[letter]
node.is_word = True

def search(self, word):
"""
Returns if the word is in the trie.
:type word: str
:rtype: bool
"""
node = self.root
for letter in word:
node = node.data.get(letter)
if not node:
return False
return node.is_word

def starts_with(self, prefix):
"""
Returns if there is any word in the trie
that starts with the given prefix.
:type prefix: str
:rtype: bool
"""
node = self.root
for letter in prefix:
node = node.data.get(letter)
if not node:
return False
return True

def get_start(self, prefix):
"""
Returns words started with prefix
:param prefix:
:return: words (list)
"""
def _get_key(pre, pre_node):
words_list = []
if pre_node.is_word:
words_list.append(pre)
for x in pre_node.data.keys():
words_list.extend(_get_key(pre + str(x), pre_node.data.get(x)))
return words_list

words = []
if not self.starts_with(prefix):
return words
if self.search(prefix):
words.append(prefix)
return words
node = self.root
for letter in prefix:
node = node.data.get(letter)
return _get_key(prefix, node)

2D list to 1D

1
2
3
matrix = [[1,5,9],[10,11,13],[12,13,15]]
sum(matrix, [])
# [1, 5, 9, 10, 11, 13, 12, 13, 15]

Variable

Swap value

1
2
# python way to swap values
a, b = b, a

chain comparison with ops

1
2
b=6
1==b<38 # False

Statement

For else

1
2
3
4
5
6
l = list(range(5))
for e in l:
if e == 0:
break
else: # called if for loop does not reach break statement
print('did not break out of for loop')

List

list to str

1
2
3
4
5
l = ["Python", "is", "awesome"]
" ".join(l)

l = list(range(5))
",".join(map(str, l))

2d array transpose

1
2
arr = [['a','b'], ['c','d'],['e','f']]
transposed = list(zip(*arr))

most frequent element in a list

1
2
3
4
5
6
7
a = [1,2,3,1,2,3,2,2,4,5,1]
max(set(a), key=a.count)

""" use Counter from collections"""
from collections import Counter
cnt = Counter(a)
cnt.most_common(3)

copy array

1
2
3
4
5
6
7
8
9
# a fast way of shallow copy of a list
b = a[:]

b = a.copy() # python3 only

# copy nested lists using copy.deepcopy
import copy
l = [[1,2], [3,4]]
l2 = copy.deepcopy(l)

Remove duplicates in list

1
2
3
4
5
6
7
items = [2,2,2,3,3,4,1]
list(set(items))

""" remove dups and keep order """
items = ['foo', 'bar', 'bar', 'foo']
from collections import OrderedDict
list(OrderedDict.fromkeys(items).keys())

Find index of Min/Max

1
2
3
4
5
6
7
8
9
10
11
l = list(range(5))

def minIndex(lst):
return min(range(len(lst)), key=lst.__getitem__)


def maxIndex(lst):
return max(range(len(lst)), key=lst.__getitem__)

minIndex(l)
maxIndex(l)

Str

check if strs are permutation

1
2
from collections import Counter
Counter(str1) == Counter(str2)

Reverse string

1
2
3
4
a ='abcdefghijklmnopqrstuvwxyz'
a[::-1]

"".join([ch for ch in reversed(a)])

find index

1
2
3
# check the index of a char, else -1
a ='abcdefghijklmnopqrstuvwxyz'
a.find(a) # 0

Dict

Dict get

1
2
d = {'a': 1, 'b': 2}
d.get('c', 3)

sort dict

1
2
3
4
5
6
sorted(d.items(), key=lambda x:x[1])

from operator import itemgetter
sorted(d.items(), key=itemgetter(1))

sorted(d, key=d.get)

merge dict

1
2
3
4
5
6
7
8
d1 = {'a': 1}
d2 = {'b': 2}

{**d1, **d2}

dict(d1.items() | d2.items())

d1.update(d2)

**kwargs

dictionary.pop(keyname, defaultvalue)

1
2
3
4
def fn(*args, **kwargs):
v1 = kwargs.pop("k1", 'default1')
v2 = kwargs.pop("k2", 'default2')
v3 = kwargs.pop("k3", 'default3')

Builtin functions

Binary, Octal, Hexadecimal integers

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# base 10 (decimal)
dec = 5

"""
base 2 (binary)
"""
# decimal to binary (base 2)
bin_dec = bin(dec) # str: '0b101'
# binary to decimal
dec = int(bin_dec, 2) # int: 5

"""
base 8 (octal)
"""
# decimal to octal (base 8)
oct_dec = oct(dec) # str: '05'
# octal to decimal
dec = int(oct_dec, 8) # int: 5

"""
base 16 (hexadecimal)
"""
# decimal to hexadecimal (base 16)
hex_dec = hex(dec) # str: '0x5'
# hexadecimal to decimal
dec = int(hex_dec, 16) # int: 5

Bit-wise Op

  • &: binary AND. Op copies a bit to the result if it exists in both operands.
  • |: binary OR. It copies a bit to the result if it exists in either operand.
  • ^: binary XOR. It copies a bit if it is set in one operand but not both.
  • ~: binary Ones Complement. It is unary and has the effect of ‘flipping’ bits.

  • <<: binary left shift.

  • >>: binary right shift.

Array index

1
2
~i # -i-1
arr[~i] # is equal to arr[-i-1]

\r application

  • \r puts the location of the cursor at the beginning of the current row.
  • \b backtracks one bit of the cursor.

Count down

1
2
3
4
5
6
7
8
9
import time

count_down = 10 # count down total time ( seconds )
for i in range(count_down, 0, -1):
msg = u"\r System will exit in " + str(i) + "secs"
print(msg, end="")
time.sleep(1)
end_msg = "end" + " "*(len(msg)-len("end")) # cover all the console output with whitespace
print(u"\r"+end_msg)

Progress bar

1
2
3
4
5
6
7
8
9
import time

count_down = 10
interval = 1
for i in range(0, int(count_down/interval)+1):
print("\r"+"▇"*i+" "+str(i*10)+"%", end="")
time.sleep(interval)
print("\n Finished")

Loading

1
2
3
4
5
6
7
8
9
10
11
import time

count_down = 10
interval = 0.25
for i in range(0, int(count_down/interval)):
ch_list = ["\\", "|", "/", "-"]
index = i % 4
msg = "\r Program is running " + ch_list[index]
print(msg, end="")
time.sleep(interval)
print(u"\r Finished" + " "*len(msg))

Functional programming

Advanced function

map

1
2
3
4
5
def f(x):
return x*x

r = map(f, [1,2,3,4]) # => Iterator
list(r) # [1,4,9,16]

reduce

1
reduce(f, [x1,x2,x3]) <=> f(f(x1, x2), x3) 
  • Application: str2int
    1
    2
    3
    4
    5
    6
    7
    8
    9
    from functools import reduce

    DIGITS = {'0':0, '1':1, '2':2, '3':3, '4':4, '5':5, '6':6, '7':7, '8':8, '9':9, '0':0}

    def char2num(s):
    return DIGIT[s]

    def str2int(s):
    return reduce(lambda x,y: x*10+y, map(char2sum, s))

filter

filter filters out sequence by applying on each element and selecting by return value (True means keep that element)

  • Filter out even numbers:

    1
    2
    3
    4
    5
    def is_odd(n):
    return n%2 == 1

    list(filter(is_odd, [1,2,3,4,5]))
    # [1,3,5]
  • Get the prime number

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    def _odd_iter():
    n=1
    while True:
    n += 2
    yield n

    def _not_divisible(n):
    return lambda x: x%n > 0

    def primes():
    yield 2
    it = _odd_iter()
       while True:
    n = next(it)
           yield n
           it = filter(_not_divisible(n), it)

    for n in primes():
    if n<1000:
    print(n)
    else:
    break

sorted

1
2
3
sorted(['bob', 'about', 'Zoo', 'Credit'], key=str.lower, reverse=True)
# ['Zoo', 'Credit', 'bob', 'about']

Decorator

  • Print the call time of a function
    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    import time

    def time_metric(func):
    # print the call time and call counts
    count = 0
    def wrapper(*args, **kwargs):
    nonlocal count
    start_time = time.time()
    data = func(*args, **kwargs)
    time_delta = time.time() - start_time
    count += 1
           print("Call {} times. It costs {} secs".format(count, time_delta))
    return data
    return wrapper

    @time_metric
    def f():
    time.sleep(0.5)

Deep copy and shallow copy

  • = operator only creates a new variable that shares the reference of the original object.
  • Deep copy is a process in which the copy process occurs recursively. It first construct a new collection object and then recursively populating it with copies of the child objects found in the original. Any changes made to a copy of object do not reflect in the original object.
    1
    2
    3
    4
    5
    6
    7
    import copy

    l = [1,2,[3,4],5]
    l_deep = copy.deepcopy(l) # deep copy
    l[2][0] = 6
    print(l) # [1,2,[6,4],5]
    print(l_deep) # [1,2,[3,4],5]
  • Shallow copy constructs a new collection object and then populating it with references to the children objects found in the original. The copying process does not recurse and therefore won’t create copies of the child objects themselves.

upload successful

1
2
3
4
5
6
7
import copy

l = [1,2,[3,4],5]
l_deep = copy.copy(l) # shallow copy
l[2][0] = 6
print(l) # [1,2,[6,4],5]
print(l_deep) # [1,2,[6,4],5]

multiprocess v.s. threading

multiprocessing

Pros:

  • Separate memory space
  • Takes advantages of multiple CPUs and cores
  • Avoids GIL(Global Interpreter Lock) limitations for cPython
  • Children processes are interruptible
  • A must with cPython for CPU-bound processing

Cons:

  • IPC a little more complicated with more overhead
  • Larger memory footprint

threading

Pros:

  • Lightweight -> low memory footprint
  • Shared memory -> makes access to state from another context easier
  • Allows you to easily make responsive UIs
  • cPython C extension modules that properly release the GIL will run in parallel
  • Great option for I/O-bound applications

Cons:

  • CPython -> subject to the GIL
  • Not interruptible/killable

Random

1
2
3
4
5
6
7
8
9
10
11
12
13
# approach 1
import numpy as np
np.random.choice(items, p)

# approach 2
def random_select(items, p):
x = random.uniform(0,1)
cum_p = .0
for item, prob in zip(items, p):
cum_p += prob
if x < cum_p:
break
return item

Pathlib

The python std library Pathlib module can be used to manipulate windows paths on a Unix machine(or vice versa). It can also be used to manipulate paths without accessing the os module.
upload successful

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
>>> from pathlib import Path
# listing subdirectories
>>> p = Path('.')
>>> [x for x in p.iterdir() if x.is_dir()]

# listing py src files in the directory tree
>>> list(p.glob('**/*.py'))

# navigate inside a directory tree
>>> p = Path('/etc')
>>> q = p / 'init.d' / 'reboot'
>>> q
PosixPath('/etc/init.d/reboot')

# query path properties
>>> q.exists()
True
>>> q.is_dir()
False

# open a file
>>> with q.open() as f: f.readline()

>>> p.name # filename
>>> p.stem # filename without suffix
>>> p.suffix # file suffix
>>> p.parent # the dirname

# create directory
>>> p=Path(r'./a/b/c')
>>> p.mkdir(exist_ok=True) # create if the parent directory exists
## iteratively create the directory
>>> p.mkdir(exist_ok=True, parents=True)

# file details
>>> p.stat() # detail information
>>> p.stat().st_size # file size
>>> p.stat().st_ctime # create time
>>> p.stat().st_mtime # modified time

itertools

itertools.count()

Iteration over infinite numbers

1
2
3
4
5
6
$ import itertools
$ results = []
$ for x in itertools.count(start=0):
$ if <condition>:
$ break
$ results.append(<element>)

itertools.permutations()

itertools.permutations(iterable, r=None)

  • Return successive $r$ length permutations of elements in the iterable. If None, then $r$ defaults to the length of the iterable and alll possible full-length permutations.
    1
    2
    # permutations('ABCD', 2) --> AB AC AD BA BC BD CA CB CD DA DB DC
    # permutations(range(3)) --> 012 021 102 120 201 210

itertools.zip_longest()

1
2
# Similar to zip but return the longest length of inputs with filling default values, whereas zip returns the shortest length
itertools.zip_longest(*iterables, fillvalue=None)

File

os

1
2
3
4
5
filepath='../test/test/test.txt'
# split filename
os.path.splittext(filepath) # ('../test/test', 'test.txt')
# split extention name
os.path.splitext(filepath) # ('../test/test/test', '.txt')

glob

1
2
# search for files
glob.glob('dir/subdir/*.txt') # return *.txt in the directory 'dir/subdir/'

My utils

echo with colors

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26

def print_format_table():
"""
prints table of formatted text format options
"""
for style in range(8):
for fg in range(30, 38):
s1 = ''
for bg in range(40, 48):
format = ';'.join([str(style), str(fg), str(bg)])
s1 += '\x1b[%sm %s \x1b[0m' % (format, format)
print(s1)
print('\n')


def cyk_print(content, font_color='green', bg_color=None, display='highlight'):
assert font_color or bg_color or display is not None
display_map = {'default': 0, "highlight": 1, "underline": 4, "flicker": 5, "reverse": 7, "invisible": 8}
font_map = {'black': 30, 'red': 31, 'green': 32, 'yellow': 33, 'blue': 34, 'purple': 35, 'yank': 36,
'white': 37}
bg_map = {'black': 40, 'red': 41, 'green': 42, 'yellow': 43, 'blue': 44, 'purple': 45, 'yank': 46,
'white': 47}
pp = "\033[{}{}{}m{}\033[0m".format(display_map.get(display, 'highlight'),
";{}".format(font_map.get(font_color, 30)),
";{}".format(bg_map.get(bg_color, '')), content)
print(pp)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
def printer(f):
@wraps(f)
def wrapped(*args, **kwargs):
print('-' * 60)
print("Start to run \033[1;36m{}\033[0m ...".format(f.__name__))
t1 = time.time()
r = f(*args, **kwargs)
print('the result is \033[1;31m{:.4f}\033[0m'.format(r), end=', ')
print("time cost: {:.5g} \033[1msecs\033[0m".format(time.time() - t1))
# return r
return wrapped

@printer
def func(...):
pass

if __name__ == '__main__':
func()

Errors

Default argument is mutable

When we assign the default argument values, avoid using mutable object, like list(), dict().

1
2
3
4
5
6
7
8
9
def error_f(item, my_list=[]):
my_list.append(item)
print(my_list)

error_f('a') ['a']

error_f('b') # ['a', 'b']

error_f('c') # ['a', 'b', 'c']
  • Python’s default arguments are evaluated once when the function is defined, not each time the function is called (like it is in say, Ruby). This means that if you use a mutable default argument and mutate it, you will and have mutated that object for all future calls to the function as well.
  • Important warning: The default value is evaluated only once. This makes a difference when the default is a mutable object such as a list, dictionary, or instances of most classes.

WTF

Deleting a list item while iterating

Never change the object you’re iterating over!

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
l1 = [1,2,3,4]
l2 = [1,2,3,4]
l3 = [1,2,3,4]
l4 = [1,2,3,4]

for idx, item in enumerate(l1):
del item

for idx, item in enumerate(l2):
l2.remove(item)

# the correct way to change the object you iterate over
for idx, item in enumerate(l3[:]):
l3.remove(item)

for idx, item in enumerate(l4):
l4.pop(idx)

1
2
3
4
5
6
7
8
>>> l1
[1, 2, 3, 4]
>>> l2
[2, 4]
>>> l3
[]
>>> l4
[2, 4]

Difference between del, remove and pop:

  • del var_name just removes the binding of the var_name form the local/global namespace
  • remove removes the fist matching value (raise ValueError if the value is not found)
  • pop removes the element at a specific index and returns it (raise IndexError if an invalid index)

Why the output is [2,4] ? [2]

  • The list iteration is done index by index. When removing 1 from l1 or l4, the contents of lists are [2,3,4]. The remaining elements are shifted down, i.e. 2 is at index 0, 3 at index 1. Since the next iteration is going to loo at index 1 (element 3), the 2 is skipped entirely.

Run

stderr directly outputs to the screen, whilst the stdout is usually saved in the buffer util full.
python -u can directly print out the stdout without buffers.

1
2
# unbuffered
python -u <xxx>.py

References