diff --git a/course_tools/README.md b/course_tools/README.md new file mode 100644 index 0000000..839e1e8 --- /dev/null +++ b/course_tools/README.md @@ -0,0 +1,20 @@ +# CSE 320 Fall 2020 Course Tools + + +Run the script to install the 320 course tools and packages. +``` +$ bash vm-setup.sh +``` + +The script will ask for sudo privileges to install packages. + +**NOTE THIS TOOL RUNS APT UPGRADE** + +Once the script has been run, read the comments at the end of the script +in order to install packages into the Sublime Text editor. The commented +commands in the script are an old version of an automatic installation +procedure, which doesn't work any more. What you should do instead is to +first manually install "Package Control", then "SublimeLinter", "SublimeLinter-gcc", +and "TrailingSpaces". Then, copy file `SublimeLinter.sublime-settings` to +directory `~/.config/sublime-text-3/Packages/User` as described in the +comments. diff --git a/course_tools/SublimeLinter.sublime-settings b/course_tools/SublimeLinter.sublime-settings new file mode 100644 index 0000000..acc61c5 --- /dev/null +++ b/course_tools/SublimeLinter.sublime-settings @@ -0,0 +1,15 @@ +// SublimeLinter Settings - User +{ + "delay":0.25, + "linters":{ + "gcc":{ + "disable":false, + "c_executable":"gcc", + "args":["-Wall"], + "I":[ + "${file_path}/../include", + "${project_path}/include" + ] + } + } +} diff --git a/course_tools/boxfort-commit-ac0507b b/course_tools/boxfort-commit-ac0507b new file mode 100644 index 0000000..be43870 --- /dev/null +++ b/course_tools/boxfort-commit-ac0507b @@ -0,0 +1,132 @@ +From ac0507b3f45fe58100b528baeb8ca04270b4a8ff Mon Sep 17 00:00:00 2001 +From: "Franklin \"Snaipe\" Mathieu" +Date: Mon, 23 Mar 2020 05:52:23 +0000 +Subject: timeout-posix: fix race condition + +The posix timeout code was racy -- if a timeout was created, and +cancelled before the watchdog had any chance to run (because the worker +would exit too quickly, or because the thread would not be scheduled 
+quickly enough). This, in turn, made the watchdog wait forever for the +timeout queue to be nonempty. + +This fixes the race by preventing the watchdog from ever waiting for the +queue to fill up -- it's actually not possible for the queue to be +empty during initialization, because the watchdog thread will be made to +wait for the initialization lock to be released. This means that the +only time where the queue is empty is when the watchdog has been +started, but the worker already exited/the timeout was cancelled. + +In addition, this fix simplifies slightly the way that the watchdog is +collected -- we no longer try to join the thread, but we make it +detached from the get go. + +This addresses Snaipe/Criterion#345. + +diff --git a/src/timeout-posix.c b/src/timeout-posix.c +index 53bd181..2e9a210 100644 +--- a/src/timeout-posix.c ++++ b/src/timeout-posix.c +@@ -22,13 +22,13 @@ + * THE SOFTWARE. + */ + #include ++#include + #include +-#include ++#include + #include + #include +-#include +-#include + #include ++#include + + #include "config.h" + #include "sandbox.h" +@@ -48,11 +48,9 @@ static struct { + int thread_active; + pthread_mutex_t sync; + pthread_cond_t cond; +- pthread_cond_t termcond; + } self = { + .sync = PTHREAD_MUTEX_INITIALIZER, + .cond = PTHREAD_COND_INITIALIZER, +- .termcond = PTHREAD_COND_INITIALIZER, + }; + + static int timespec_cmp(struct timespec *a, struct timespec *b) +@@ -96,8 +94,6 @@ static void to_timespec(double timeout, struct timespec *timeo) + static void *timeout_killer_fn(void *nil) + { + pthread_mutex_lock(&self.sync); +- while (!self.requests) +- pthread_cond_wait(&self.cond, &self.sync); + + struct bxfi_timeout_request *req; + for (;;) { +@@ -125,7 +121,7 @@ static void *timeout_killer_fn(void *nil) + free(req); + } + end: +- pthread_cond_broadcast(&self.termcond); ++ self.thread_active = 0; + pthread_mutex_unlock(&self.sync); + return nil; + } +@@ -137,10 +133,6 @@ void bxfi_reset_timeout_killer(void) + + memcpy(&self.sync, 
&mutex, sizeof (mutex)); + memcpy(&self.cond, &cond, sizeof (cond)); +- memcpy(&self.termcond, &cond, sizeof (cond)); +- +- if (self.requests) +- pthread_join(self.thread, NULL); + } + + int bxfi_push_timeout(struct bxfi_sandbox *instance, double timeout) +@@ -159,10 +151,16 @@ int bxfi_push_timeout(struct bxfi_sandbox *instance, double timeout) + + pthread_mutex_lock(&self.sync); + if (!self.requests) { +- if (self.thread_active) +- pthread_join(self.thread, NULL); ++ pthread_attr_t attrs; ++ if ((rc = pthread_attr_init(&attrs)) != 0) { ++ rc = -rc; /* pthread_attr_init returns the error number directly; negate it like pthread_create below */ ++ goto error; ++ } ++ pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED); ++ + self.thread_active = 1; +- rc = -pthread_create(&self.thread, NULL, timeout_killer_fn, NULL); ++ rc = -pthread_create(&self.thread, &attrs, timeout_killer_fn, NULL); ++ pthread_attr_destroy(&attrs); + if (rc) + goto error; + } +@@ -177,7 +175,6 @@ int bxfi_push_timeout(struct bxfi_sandbox *instance, double timeout) + *nptr = req; + + pthread_cond_broadcast(&self.cond); +- pthread_cond_broadcast(&self.termcond); + pthread_mutex_unlock(&self.sync); + return 0; + +@@ -204,17 +201,6 @@ void bxfi_cancel_timeout(struct bxfi_sandbox *instance) + } + if (cancelled) { + pthread_cond_broadcast(&self.cond); +- if (!self.requests) { +- while (self.cancelled && !self.requests) +- pthread_cond_wait(&self.termcond, &self.sync); +- if (self.requests) +- goto end; +- if (self.thread_active) { +- pthread_join(self.thread, NULL); +- self.thread_active = 0; +- } +- } + } +-end: + pthread_mutex_unlock(&self.sync); + } diff --git a/course_tools/criterion.zip b/course_tools/criterion.zip new file mode 100644 index 0000000..0b09380 Binary files /dev/null and b/course_tools/criterion.zip differ diff --git a/course_tools/git-submit b/course_tools/git-submit new file mode 100755 index 0000000..c09ecef --- /dev/null +++ b/course_tools/git-submit @@ -0,0 +1,231 @@ +#!/usr/bin/env python3 +import requests +from requests import get +from requests.exceptions 
import ConnectTimeout, ConnectionError, HTTPError +from requests.packages.urllib3.exceptions import InsecureRequestWarning +from datetime import datetime +from signal import signal, SIGINT, SIGTSTP, SIG_IGN +from subprocess import run, PIPE +from shlex import split +from argparse import ArgumentParser +from sys import exit + +RED = '\033[1;31m' +GRE = '\033[1;32m' +YEL = '\033[0;33m' +BLU = '\033[1;34m' +PUR = '\033[1;35m' +CYA = '\033[1;36m' +BOLD = '\033[1m' +DEF = '\033[0m' + +BASE_URL = 'https://cse320.starkeffect.com:3000/submit' +server_timeout = None + + +def yn_prompt(): + prompt = input('{}Continue? Please enter Y/n{}: '.format(PUR, DEF)) + while prompt != 'Y' and prompt.lower() != 'n': + prompt = input('{}Continue? Please enter Y/n{}: '.format(PUR, DEF)) + if prompt != 'Y': + print('{}Aborting...{}'.format(YEL, DEF)) + exit(1) + + +def first_api_call(): + global server_timeout + try: + res = get(BASE_URL + '/timeout', params=None, timeout=15, verify=False) + res_json = res.json() + server_timeout = res_json['timeout'] + except (ConnectionError, ConnectTimeout, HTTPError): + print('Error connecting to the server. Try again. (FIRST_SERVER_CALL)') + exit(1) + + +def initial_message(): + print('Only commits made from master or merged into master will be ', + 'considered. If you would like to submit a commit from a different ', + 'branch, please merge the branch into master first.') + yn_prompt() + + +def api_call(endpoint, query): + global server_timeout + try: + res = get(BASE_URL + endpoint, params=query, timeout=server_timeout, + verify=False) + except (ConnectionError, ConnectTimeout, HTTPError): + print('Error connecting to the server. Try again. 
', + '(SERVER_CONNECTION_ERROR)') + exit(1) + res_json = res.json() + if 'errmsg' in res_json: + print(res_json['errmsg']) + exit(1) + return res_json + + +def validate_git_directory(): + cmd = split('git remote show origin') + proc = run(cmd, stdout=PIPE) + if proc.returncode: + exit(proc.returncode) + # You are not expected to understand this + return [line for line in proc.stdout.decode().split('\n') if + line.strip().startswith('Push')][0].split('/')[-1].split('.')[0] + + +def validate_commit(commit): + # check current user branch + branch_cmd = split('git rev-parse --abbrev-ref HEAD') + branch_proc = run(branch_cmd, stdout=PIPE, stderr=PIPE) + if branch_proc.returncode: + print('{}Error{}: Uninitialized repository. Please initialize your repo.'.format(RED, DEF)) + exit(branch_proc.returncode) + current_branch = [line for line in branch_proc.stdout.decode().split('\n')][0] + if not current_branch == 'master': + print('{}Error{}: Run git submit from the master branch'.format(RED, DEF)) + exit(1) + # check branch commit was made from + if not commit == 'master': + if len(commit) < 8: + print('{}Error{}: Use at least the first 8 characters of the commit hash.'.format(RED, DEF)) + exit(1) + commit_branch_cmd = split('git branch --contains {}'.format(commit)) + commit_branch_proc = run(commit_branch_cmd, stdout=PIPE, stderr=PIPE) + if commit_branch_proc.returncode: + print('{}Error{}: The specified commit hash is invalid.'.format(RED, DEF)) + exit(commit_branch_proc.returncode) + branches = [branch for branch in commit_branch_proc.stdout.decode().split('\n')] + on_master = False + for branch in branches: + branch_name = [word.strip() for word in branch.split('*')][-1] + if branch_name == 'master': + on_master = True + if not on_master: + print('{}Error{}: The submitted commit should be from master. 
Merge the branch with the commit into master.'.format(RED, DEF)) + exit(1) + cmd = "bash -c 'git log --branches=master* --not --remotes --pretty=oneline'" + proc = run(cmd, stdout=PIPE, shell=True) + if proc.returncode: + print(proc.stdout) + exit(proc.returncode) + elif commit == 'master' and proc.stdout: + print('{}Error{}: Push your commits to the remote using: git push'.format(RED, DEF)) + exit(1) + elif commit in proc.stdout.decode(): + print('{}Error{}: Push your commits to the remote using: git push'.format(RED, DEF)) + exit(1) + + +def validate_tag(tag): + response = api_call('/verify/{}'.format(tag), None) + if 'err' in response: + print('{}Error{}: {}'.format(RED, DEF, response['err'])) + exit(1) + if not response['valid']: + print('{}Error{}: {}'.format(RED, DEF, response['msg'])) + exit(1) + +def check_for_resubmission(args): + status = api_call('/status', args) + print('{}Submission Time{}: {}'.format(BLU, DEF, status['time'])) + if status['submitted']: + print('{}Resubmitting Homework{}: {} -- Are you {}sure{} you wish to resubmit ' + '(this may result in lateday penalties)?'.format(YEL, DEF, CYA + args['tag'] + DEF, BOLD, DEF)) + else: + print('{}Submitting Homework{}: {} -- Are you {}sure{}? 
' + .format(YEL, DEF, CYA + args['tag'] + DEF, BOLD, DEF)) + yn_prompt() + return status['submitted'], status['time'] + + +def confirm_repo_state(): + run(split('git -c color.status=always status')) + print('{}Current Repo State{}: Are you {}sure{} you have committed and pushed all the files needed ' + '(such as .h files)?'.format(YEL, DEF, BOLD, DEF)) + yn_prompt() + + +def confirm_commit(commit): + cmd = split('git --no-pager -c color.ui=always show {} --pretty=fuller --quiet'.format(commit)) + proc = run(cmd) + if proc.returncode: + exit(proc.returncode) + print('{}Submission Commit{}: Are you {}sure{} this is the commit you wish to submit?'.format(YEL, DEF, BOLD, DEF)) + yn_prompt() + + +def confirm_submission(tag): + print('{}Confirm Submission{}: Are you {}sure{} you want to submit {}? Your previous submission (if any) will be ' + '{}overwritten{}!'.format(YEL, DEF, BOLD, DEF, CYA + tag + DEF, BOLD, DEF)) + yn_prompt() + + +def trigger_submission(args): + info = api_call('', args) + return info['late'], info['lateDays'] + + +def submission_info(**kwargs): + run(split('git pull --quiet')) + late = kwargs['late'] + tag = kwargs['tag'] + resubmit = kwargs['submit'] + late_days = kwargs['days'] + attempt_time = kwargs['time'] + if late: + print('{}Urgent{}: {} was overdue. You did not have enough late days remaining.'.format(RED, DEF, tag)) + print('{}Info{}: You have {} lateday(s) remaining.'.format(BLU, DEF, late_days)) + if resubmit: + print('{}Alert{}: Your last submission will be taken into consideration.'.format(YEL, DEF)) + else: + print('{}Urgent{}: You will be given an automatic zero for this assignment. Please meet your Professor\n' + 'after lecture or during office hours or please email us at cse320@cs.stonybrook.edu with\n' + '{}[CSE320] - {} Overdue{} as the subject.'.format(RED, DEF, BLU, tag, DEF)) + else: + print('{}Success{}: {} submission successful. Your assignment was submitted on {}.' 
+ .format(GRE, DEF, tag, attempt_time)) + print('{}Info{}: You have {} lateday(s) remaining.'.format(BLU, DEF, late_days)) + print('Thank you for submitting your homework! We are working hard to get your grades out as soon as possible.') + print('If you have any concerns please email us at cse320@cs.stonybrook.edu with {}[{}]{} in the subject.' + .format(BLU, tag, DEF)) + print('{}The CSE 320 Team{}'.format(PUR, DEF)) + + +def main(arg_parser): + requests.packages.urllib3.disable_warnings(InsecureRequestWarning) + first_api_call() + initial_message() + net_id = validate_git_directory() + args = arg_parser.parse_args() + attempt_time = datetime.now() + validate_tag(args.TAG) + validate_commit(args.commit) + resubmit, time = check_for_resubmission({'tag': args.TAG, 'repo': net_id, 'attemptTime': attempt_time.isoformat()}) + confirm_repo_state() + confirm_commit(args.commit) + confirm_submission(args.TAG) + signal(SIGINT, SIG_IGN) + signal(SIGTSTP, SIG_IGN) + late, late_days = trigger_submission({'tag': args.TAG, 'repo': net_id, 'commit': args.commit}) + submission_info(tag=args.TAG, time=time, submit=resubmit, late=late, days=late_days) + exit(0) + + +class GitArgParser(ArgumentParser): + def error(self, message): + print('{}Error{}: {}'.format(RED, DEF, message)) + self.print_help() + exit(2) + + +if __name__ == '__main__': + parser = GitArgParser(prog='git submit', description='Submit your homework assignment using git.') + parser.add_argument('TAG', type=str, help='The homework you wish to submit. Can have one of the following values: ' + 'hw0, hw1, hw2, hw3, hw4, hw5') + parser.add_argument('-c', dest='commit', type=str, required=False, default='master', + help='Used if you wish to submit a commit that is not the latest. COMMIT is the SHA value of ' + 'your commit.') + main(parser) diff --git a/course_tools/vm-setup.sh b/course_tools/vm-setup.sh new file mode 100755 index 0000000..2567c8e --- /dev/null +++ b/course_tools/vm-setup.sh @@ -0,0 +1,124 @@ +#! 
/usr/bin/env bash + +echo +if hash figlet 2> /dev/null; then + echo "320 Setup" | figlet -f banner +else + echo "320 Setup" +fi +echo + +echo "Updating..." +# General updates +sudo apt-get update -y +echo "Graphics issues from 8/2021 now fixed with VirtualBox 6.1.27 or later -- doing apt-get upgrade" +sudo apt-get upgrade -y +sudo apt-get autoremove -y 2>&1 > /dev/null + +# Extras +sudo apt-get install -y figlet terminator htop dialog wget tree 2>&1 > /dev/null + +echo "Installing..." +echo "vm-tools" | figlet -f mini +sudo apt-get install -y open-vm-tools-desktop 2>&1 > /dev/null + +echo "Installing..." +echo "Git" | figlet -f mini +sudo apt-get install -y git 2>&1 > /dev/null + +echo "Installing..." +echo "Gitk" | figlet -f mini +sudo apt-get install -y gitk 2>&1 > /dev/null + +echo "Installing..." +echo Git Submit | figlet -f mini +mkdir -p $HOME/.local +mkdir -p $HOME/.local/bin +cp -p git-submit $HOME/.local/bin +chmod +x $HOME/.local/bin/git-submit + +echo "Installing..." +echo "Readline" | figlet -f mini +sudo apt-get install -y libreadline-dev readline-doc 2>&1 > /dev/null + +echo "Installing..." +echo "Clang" | figlet -f mini +sudo apt-get install -y clang 2>&1 > /dev/null + +echo "Installing..." +echo "GDB" | figlet -f mini +sudo apt-get install -y gdb cgdb 2>&1 > /dev/null + +echo "Installing..." +echo "Valgrind" | figlet -f mini +sudo apt-get install -y valgrind 2>&1 > /dev/null + +echo "Installing..." +echo "GCC and tools" | figlet -f mini +sudo apt-get install -y gcc make binutils 2>&1 > /dev/null + +echo "Installing..." +echo "POSIX man pages" | figlet -f mini +sudo apt-get install -y manpages-posix-dev 2>&1 > /dev/null + +echo "Installing..." +echo "Ncurses" | figlet -f mini +sudo apt-get install -y libncurses-dev 2>&1 > /dev/null + +echo "Installing..." 
+echo "Criterion" | figlet -f mini +#sudo add-apt-repository -y ppa:snaipewastaken/ppa +#sudo apt-get update +#sudo apt-get install -y criterion-dev +sudo unzip -d / criterion.zip + +dialog --keep-tite --title "Sublime Text" --yesno "Do you want to install Sublime with plugins?" 5 50 + +if [ $? -eq 0 ]; then + + # Add Sublime key + wget -qO - https://download.sublimetext.com/sublimehq-pub.gpg | sudo apt-key add - 2>&1 > /dev/null + echo "deb https://download.sublimetext.com/ apt/stable/" | sudo tee /etc/apt/sources.list.d/sublime-text.list 2>&1 > /dev/null + + sudo apt-get update -y + + echo "Installing..." + echo "Sublime Editor" | figlet -f mini + sudo apt-get install sublime-text 2>&1 > /dev/null + + mkdir -p "$HOME/.config" + mkdir -p "$HOME/.config/sublime-text-3" + mkdir -p "$HOME/.config/sublime-text-3/Installed Packages" + mkdir -p "$HOME/.config/sublime-text-3/Packages" + mkdir -p "$HOME/.config/sublime-text-3/Packages/User" + + # EWS (8/8/2020) + # The following configuration is somewhat obsolete, and in any case is too sensitive to + # changes to Sublime and its plugins. When I tried to run it just now, it left things + # broken. Instead I have manually installed the following for use in CSE 320: + # + # Package Control + # SublimeLinter + # SublimeLinter-gcc + # TrailingSpaces + # + # The only really important thing is to copy SublimeLinter.sublime-settings to + # ~/.config/sublime-text-3/Packages/User as in the last commented line below, so that + # the linter works correctly with our project setup. + # + # The rest of this stuff I have commented out for now. 
+ # + #touch "$HOME/.config/sublime-text-3/Packages/User/Package Control.sublime-settings" + #touch "$HOME/.config/sublime-text-3/Installed Packages/Package Control.sublime-package" + #wget -qO - https://packagecontrol.io/Package%20Control.sublime-package > "$HOME/.config/sublime-text-3/Installed Packages/Package Control.sublime-package" + # + #echo "{\"bootstrapped\":true,\"installed_packages\":[\"Package Control\",\"TrailingSpaces\",\"SublimeLinter\",\"SublimeLinter-contrib-gcc\"]}" > "$HOME/.config/sublime-text-3/Packages/User/Package Control.sublime-settings" + #echo "{\"trailing_spaces_trim_on_save\": true}" > "$HOME/.config/sublime-text-3/Packages/User/trailing_spaces.sublime-settings" + #echo "{\"ignored_packages\":[\"Vintage\"],\"hot_exit\":false,\"save_on_focus_lost\":true,\"translate_tabs_to_spaces\":true}" > "$HOME/.config/sublime-text-3/Packages/User/Preferences.sublime-settings" + cp -p SublimeLinter.sublime-settings ~/.config/sublime-text-3/Packages/User +fi +echo "-----------------------------" +echo "!ATTN!" | figlet +echo "-----------------------------" +echo -e "If you \e[31;1mcannot\e[0m execute git submit add the following to your ~/.bashrc or other relevant terminal config" +echo "export PATH=\$PATH:$HOME/.local/bin" diff --git a/hw0-doc/README.md b/hw0-doc/README.md new file mode 100644 index 0000000..4b83295 --- /dev/null +++ b/hw0-doc/README.md @@ -0,0 +1,754 @@ +# CSE320 Spring 2022 + +In this course you will be using Linux as your primary development +environment. In addition, we will be providing you with a git +repository hosted on a department GitLab server. This document will +briefly explain the course tools and outline the required setup for +this course. + +## Setting up your CSE320 Git repository + +Git is an open-source distributed version control system. We will use +git repositories to manage your homework submissions. 
In addition, the +use of git allows the Professor and TAs to access and view your +code remotely in order to assist you. While some students may be +familiar with version control and git, we ask that everyone complete +the following tutorial and instructions. This will ensure that +everyone in the course has the same background knowledge and can +submit their homeworks. + +We are using a CSE department supported git web interface, called +gitlab. This is similar to github, bitbucket, etc. It is an interface +to help manage git repositories. These services are INTERFACES to git, +not git itself. You *may not* use external repositories as we will +use the repo provided to you to grade your submitted work and share +gradesheets with you. + +To set up your repository: + +1. Navigate to +[https://gitlab02.cs.stonybrook.edu](https://gitlab02.cs.stonybrook.edu/) +and log into it with your CS email account (user name only, do not +include the `@cs.stonybrook.edu`). If you forgot your CS email +password you can reset it by following the instructions +[here](https://auth01.cs.stonybrook.edu:10443/). If those +instructions fail, please email `rt@cs.stonybrook.edu` requesting a +password reset. A response may take up to 24-48 hours. +2. Once you have logged in, the creation of your repo will be triggered. +Normally this will occur within a few minutes. If not, then send an +email to `cse320@cs.stonybrook.edu` and we will look into it. +Sometimes the 'bot responsible for creating the repos has to be reset. + +## Setting up Linux Environment + +Since C is a systems level language, frequently the behavior from one +person’s computer to another can vary. In the past, we have provided a +common server for students to use, but this presented a few +problems. When you wanted to compile your assignment, you would have +to continuously transfer the file to the server and then compile +it. 
If you had any mistakes, you would have to either edit it on the +server or make the change locally and upload it again. This became +very tedious which often led to students compiling and testing +locally on their own machines. This was not always a good idea as +something that seemed to work for you didn’t always work for the +grader which caused many issues. Also, many tools, which assist in +locating and fixing errors in C code, do not exist in Windows and OSX +environments. So students who installed operating systems such as +[Linux](https://en.wikipedia.org/wiki/Linux) were at an advantage over +the students who did not. + +> :nerd: This document will also outline the homework management and + submission process. In this class, you will be creating increasingly + complex C projects which may involve many files. To satisfy these + requirements, we will be using git to manage & submit your homework + assignments. + +> :nerd: While we will try to provide the basics for what needs to be + done, it will ultimately be up to you to learn how to use these + tools. + +To help alleviate the above issues and to setup a local environment +with the necessary course tools, you must install your working +environment using one of these two options: + +- Option 1: A Virtual Machine running Linux (Encouraged Option) +- Option 2: Multi-Boot/Install Linux on your machine + +Option 1 is encouraged for the following reasons: +- Quick setup +- Ease of use in your native OS +- Easy to reset if errors in VM environment +- All course tools are pre-installed +- Simulate multiple cores on a single core system + +We have put a lot of effort into setting up a pre-configured VM. If +for some reason you are unable or unwilling to use this, we have +provided basic instructions for Option 2 with a script to install all +the course tools. 
+ +If you choose option 2, you should have some idea what you are doing, +already be comfortable with Linux, and be aware that we probably won't +have the resources to debug any issues you might encounter. If you +deviate in any other way from these procedures, it is completely at +your peril. + +### Option 1: A Virtual Machine running Linux + +Students often use either [VMware](https://www.vmware.com) or +[VirtualBox](https://www.virtualbox.org/) to run virtual machines. +We recommend that you use VirtualBox. It is free, and it runs +on all of the most popular platforms. + +In order to run a virtual machine, your machine must support 64-bit +hardware virtualization. Most machines built after 2006 should support +this. However, not all machines have the option enabled. You may need +to modify your BIOS settings to enable this feature. As each machine +has a different BIOS, it is up to you to find and enable this feature +on your own machine. + +Download and install the VirtualBox platform package appropriate +for your computer from [this site](https://www.virtualbox.org/wiki/Downloads). + +> :exclamation: Because of recent changes made to the way VirtualBox interfaces +> with the graphics drivers on various platforms, it is important that you make +> sure to install VirtualBox version 6.1.27 or greater. With older versions, +> the course VM image will probably not be able to access the display properly. + +#### Running the Linux VM + +We will be using Linux Mint 20 "Ulyana" -- Cinnamon as this semester's OS. We +have taken the time to set up the VM so it simply needs to be opened +in your virtualization program. The provided Linux virtual machine +has all the tools required for various aspects of this course; for +example, homework submission is pre-installed. + +To get started, download the VM from here: +[Google Drive] +(https://drive.google.com/file/d/1rwUM_rm4sEC-we-i-siOPWDnQEtH6mdC/view?usp=sharing) +(it's nearly 5 gb so give it some time). 
+This should result in your having a file called **CSE320_Spring22.ova**. +This can be imported directly into VirtualBox by choosing +"Import Appliance" from the "File" menu and then browsing to select +the file you downloaded. Click "Next", review the VM settings, +and then click on "Import". Once the import has completed, you should +have a VM called "CSE 320". Select this and click on +"Start" to boot the VM. + +#### Login Info + +Upon booting, you will be automatically logged in as user `student`. +The login info for your reference is: + + +| Username | Password | +|:----------------|:-------------| +| `student` | `cse320` | + +You will need the password in order to obtain superuser access via `sudo` +to install software, and you might need to enter both the user name and +the password if the screen lock should kick in after you have left the VM +idle for some time. + +#### VirtualBox Guest Additions + +The VirtualBox Guest Additions are software components that are added +to the guest operating system that runs in your VM, to make the VM more +convenient to use. Examples of things in the Guest Additions are accelerated +video drivers, support for clipboard and drag-and-drop between the VM +and the host system, ability to resize the VM window, and so on. +There is a version of the Guest Additions installed in the VM, +but since the Guest Additions need to match the version of VirtualBox +that you are using, you should reinstall them. To do this, you should +start the VM, then from the "Devices" menu (probably in the titlebar of +the VM window, or wherever top-level application menus appear on your +system) select "Insert Guest Additions CD Image". This might cause +a CD image to be downloaded over the network. If the system offers to +auto-run the CD, allow it to do so. Otherwise you might have to use +file manager (under Linux Mint) to open the CD manually. Once started, +it can take several minutes for the installation to complete. 
+ +#### VM Snapshots + +If you choose to install additional tools or other programs to your +environment, you may want to take a snapshot of your VM. This may save +you the time of installing your additional software again, in the +unfortunate event of an unusable VM. Refer to the appropriate VirtualBox +documentation to learn how to take a snapshot of your VM. + +### Option 2: Multi-Boot/Install Linux on your machine + +> Remember, if you choose this option, you should have some idea what + you are doing, already be comfortable with Linux, and be aware that + we probably won't have the resources to debug any issues you might + encounter. If you deviate in any other way from these procedures, + it is completely at your peril. + +Install [Linux Mint 20 "Ulyana" - Cinnamon 64-bit](https://linuxmint.com/edition.php?id=281) +or 20.04 Ubuntu variant (as long as you are using gcc 9.3.0) as a dual-boot or fresh +install. + +Clone the [CSE320 course tools](https://gitlab02.cs.stonybrook.edu/cse320/course_tools) +(https://gitlab02.cs.stonybrook.edu/cse320/course_tools) repository +into your Linux environment. You may need to install git first. + +Follow the README in the `course_tools` repo. + +#### Note about MacOS with Apple M1 Processor + +We are aware that a number of students are now using Macs with an M1 processor. +The M1 hardware uses the ARM instruction set, which is different than the +x86-64 instruction set which the course Linux Mint VM uses. At the time of this +writing, we do not have any reliable information that would indicate that it would +be possible to run the Linux Mint VM on an M1. It might be possible to run +it using QEMU, which is a full x86-64 emulator that is independent of the +underlying host system hardware and for which versions exist for Macs running +on the M1, though to date we do not have any information from anyone who has +succeeded in running the VM this way (please tell us if you have managed to do it). 
+However, even if in fact the VM can be run this way it is likely to be very slow. +So, our best advice at this time would be to try to identify some x86-64-based +computer that you can use for the course, rather than supposing that you will +be able to use an M1-based computer. + +### Working in Unix + +We understand that many of the students taking this class are new to +CLI (Command-line interface). You can find a quick crash course in +[Appendix A of Learn Python the Hard Way](https://learnpythonthehardway.org/book/appendixa.html). + +> :nerd: For more advanced usage refer + [here](http://www.ibm.com/developerworks/library/l-lpic1-103-1/). This + is a REALLY good resource so we recommend bookmarking it for later + reference. + +> :nerd: It is **very** important that you properly shut down the Linux Mint + operating system when you are finished using it, rather than just + "X-ing out" the VirtualBox VM window. The latter is equivalent to + going and yanking your desktop PC's power plug out of the wall without + shutting down Windows, and it can cause data loss and even corruption. + Use the shutdown icon from the "Mint" menu in the lower left corner + of the desktop to shutdown Linux Mint. At that point, it will be safe + to power off the VM. + +> :nerd: Depending on the host system on which you installed VirtualBox, + "keyboard integration" and "mouse integration" might or might not be + supported. If they are supported, then you will be able to fairly + seamlessly move your mouse in and out of the VM window and what you + type on the keyboard will go to the proper place. If these features + are not supported, then you will need to click on the VM window in + order to use it, at which point the mouse and keyboard will be "captured" + by the VM. In order to regain control of the mouse and cursor, you + will need to press the "host key", which is identified at the right-hand + side of the bottom icon tray of the VirtualBox window. 
On some systems, + the default host key is "Right Ctrl". + +> :nerd: To open a terminal window, you can click on the terminal + icon (which should be fairly evident), or you can press CTRL + ALT + T. + +#### Text Editor + +A _good_ basic text editor is the key for C development. + +We have pre-installed Sublime Text with plugins such as a C linter to +assist with C development on the given VM. A linter displays compiler +errors on top of your code much like an IDE. If you do install another +editor, we recommend looking for a similar linting feature, as it +will aid development. + +You may use another text editor if you so desire. Some popular ones +are Atom, Vim, Emacs and VSCode. Each has its own linters that you +can look into installing. + +**DO NOT** install and use a full IDE (Clion, Netbeans, or Eclipse); + there are many parts of the compilation process that are hidden from + you. Not only would you miss out on valuable information pertinent + to the course but your project is not guaranteed to build in an + environment separate from the IDE. + +## Homework Management & Submission + +#### Setting up your CSE320 repository + +Once your repository has been created on gitlab, you must clone it in +your Linux environment. Open a new terminal window +and type `git clone GIT_URL`. You should replace `GIT_URL` with the +URL to your repository. You can find it by navigating to your projects +page on GitLab and selecting the https option. + +> Your repo should be cloned into your home directory (`/home/student/`, AKA `~/`) + + Alternatively, if you add an ssh-key to your gitlab account you can + clone, pull, push, etc. using the URL under the SSH option (**highly + recommended**; an SSH key can be added at any time). + + Reference: + - [Generating SSH key](http://docs.gitlab.com/ce/ssh/README.html) + +#### First Commit to your Repo + +Open a terminal and from the home directory enter the following command: +(replacing REPO_NAME with your repo's name) + +
+$ subl REPO_NAME
+
+ +The text editor, Sublime, will open and your repo's contents will be +shown on the sidebar. Open the `README.md` file and add the text with +the following information relevant to you. + +```markdown +# FIRST_NAME LAST_NAME +## ID_NUMBER +:FAVORITE_EMOJI: +PROFESSOR_NAME - SECTION_NUMBER +``` + +You can find your favorite emoji code among these +[https://gist.github.com/rxaviers/7360908](https://gist.github.com/rxaviers/7360908). +After that you can save and close the file and return to your terminal. + +In your terminal, type the following commands, replacing `EMAIL` with +your CS email address and `NAME` with your name: + +
+$ git config --global user.email "EMAIL"
+$ git config --global user.name "FIRST_NAME LAST_NAME"
+$ git config --global push.default simple
+
+ +**NOTE:** This will change your settings for all repos. If you want to + have different settings for other repos on your machine then omit + `--global` + +Change directories into your repo `cd REPO_NAME` + +Then run the following commands: + +
+$ git status
+$ git add README.md
+$ git commit -m "My First Commit"
+$ git push
+
+ +> The `git push` command will prompt for username and password if you used HTTPS. + +The output will look **similar** to: + +
+$ git status
+On branch master
+Your branch is up-to-date with 'origin/master'.
+Changes not staged for commit:
+  (use "git add file..." to update what will be committed)
+  (use "git checkout -- file..." to discard changes in working directory)
+
+    modified:   README.md
+
+no changes added to commit (use "git add" and/or "git commit -a")
+$ git add README.md
+$ git commit -m "My First Commit"
+[master XXXXXXX] My First Commit
+ 1 files changed, X insertions(+), X deletions(-)
+$ git push
+Counting objects: 4, done.
+Delta compression using up to 4 threads.
+Compressing objects: 100% (4/4), done.
+Writing objects: 100% (4/4), 980 bytes | 0 bytes/s, done.
+Total 4 (delta 2), reused 0 (delta 0)
+To ssh://git@gitlab02.cs.stonybrook.edu:130/CSE320_Fall20/REPONAME.git
+   XXXXXXX..XXXXXXX  master -> master
+Branch master set up to track remote branch master from origin.
+$
+
+ +This is the basic usage of git. We check the `status` of which files +are tracked/untracked. Then we `add` them and we `commit` them along +with a message. Lastly and most importantly we `push` them to the +remote repository on the gitlab server. If the push was successful, +you can navigate back to the page `https://gitlab02.cs.stonybrook.edu` +and select your repository. Inside your repository, select the files +option on the left menu. You should now see the file `README.md` with +the contents you added to it. + +> :scream: Once a commit has been made, its contents cannot be changed. +> In addition, the GitLab server has been configured so that it is not +> possible to delete any commits that have been pushed to the "master" +> branch. This means that any junk you commit to the master branch and +> push to the server will persist there forever in your repo, as well as +> in copies that we have to store. In view of this, it is important that +> you take great care not to commit junk files, especially files that are +> very large or binary files that are generated by the compiler. +> Each time you commit, you should first use `git status` to carefully review +> the set of files to be committed. Use `git reset` to remove any files that +> are staged for commit but should not be. We strongly recommend that you +> *never* use commands such as `git add .` or `git add --all`, as these have +> the potential to add a lot of junk to your commit. Instead, `git add` each +> file individually, after perhaps using `git diff` to remind yourself of the +> reason for the commit and to see if the changes are as they should be. + +#### Git Tutorial + +We recommend you complete Codecademy’s git tutorial found +[here](https://www.codecademy.com/learn/learn-git) if you are +unfamiliar with git. 
+ +If you’re interested in learning more information about git or +expanding your knowledge, refer to these references: +- [git-book](https://git-scm.com/book/en/v2) - Chapter 2 is a MUST + read chapter, checkout git aliases! +- [Learn Git Branching](http://learngitbranching.js.org/) - An + interactive tutorial on git branching +- [git cheat sheet](https://scotch.io/bar-talk/git-cheat-sheet) + +# Homework 0 + +#### Obtaining Assignment Code + +1. Navigate to your repository directory (`cd ~/REPO_NAME`) in your VM +(using the terminal). + +2. An assignment, such as this one, will tell you the code is located +at a particular address. For `hw0` it is: +`https://gitlab02.cs.stonybrook.edu/cse320/hw0.git` + +3. Add this remote repository as an additional remote into your +existing repository. We will name the new remote HW0_CODE. + + If you use HTTPS: + + ``` + $ git remote add HW0_CODE https://gitlab02.cs.stonybrook.edu/cse320/hw0.git + ``` + + If you use SSH: + + ``` + $ git remote add HW0_CODE ssh://git@gitlab02.cs.stonybrook.edu:130/cse320/hw0.git + ``` + +4. Fetch all the refs in this new repository. This command will prompt +for username and password if you used HTTPS. + + ``` + $ git fetch HW0_CODE + ``` + +5. Finally, merge and commit the files from the `HW0_CODE` remote's +`master` branch into your existing repository’s `master` branch. + + ``` + $ git merge -m "Merging HW0_CODE" HW0_CODE/master + ``` + + > :nerd: If you get an error mentioning 'unrelated histories' try + again adding this flag: `--allow-unrelated-histories` + +6. If you type the command `ls` you should now see a directory called `hw0`. + +7. Push these base files to your remote repository (gitlab). This +command will prompt for username and password if you used HTTPS. + + ``` + $ git push + ``` + +#### Your Homework 0 Working Directory + +The directory structure of your repo will now look **similar** to +this. Use `ls -a` or `tree -a` to see the hidden files that begin with +`.` + +
+YOUR_REPO
+├── .git
+│   ├── ...
+├── .gitignore
+├── .gitlab-ci.yml
+├── hw0
+│   ├── academic_honesty.txt
+│   ├── include
+│   │   └── hi.h
+│   ├── Makefile
+│   ├── README.md
+│   ├── src
+│   │   ├── hi.c
+│   │   └── main.c
+│   └── tests
+│       └── test.c
+└── README.md
+
+ +Information about each file is explained below. + +> :nerd: Enter `subl REPO_NAME` (or `subl .` if you are in your repo + already) as you did before to easily follow along and look inside + each file + +- `.gitignore` - This is a file that tells git to ignore certain + directories or files that you don't want committed. For example, the + `bin` and `build` directories are ignored. + This is because we don't want executables and other generated binary files + pushed to your remote repository, only source code. +- `.gitlab-ci.yml` This is gitlab's own continuous integration + configuration file, explained in a later section. +- `hw0/` - This is your first homework directory, throughout the + semester we'll be adding each homework directory in this fashion + 'hw#' where # is the homework number. Inside the `hw0/` directory, + you will find: + - `README.md` - This is a file where you can detail notes about the project. + - `Makefile` - This is your ultimate compilation automation + tool. The program `make` will use this file to properly compile + your assignment. + - `include/` - This is where we keep our `.h` headers. Unlike + Java, C is a one pass compilation language, which keeps the + overhead of creating symbol tables and structures low. Since all + functions must be defined before use, we utilize a header file + to make the symbols available across multiple files. + - `hi.h` - This is our header file for `hw0`. **Examine the + contents of this file.** + - `src/` - This is where we keep our `.c` source files. These + files contain the actual definitions of the functions declared + in our headers. + - `main.c` - This file contains the C main function, in this + course you will **ALWAYS** need to keep your main function + in its own C file isolated from the rest of your functions + that you implement. + - `hi.c` - The helper function, `hi`, is defined (implemented) + here. Each function **does not** need its own file, only + `main()` needs to be in its own file. 
+ - `tests/` - This is where we keep our unit tests. There is an + EXCELLENT unit testing framework called + [criterion](http://criterion.readthedocs.io/en/master/intro.html) + that we will be using in the course. + - `test.c` - This file contains the implementation of our + testing framework. The Makefile will compile these files + with all of the non-`main.c` files. This gives the testing + framework access to your helper functions. + +> **Do not** modify or alter `.gitlab-ci.yml`, the `Makefile` or the + `README.md` for any assignment unless otherwise instructed. + +### Academic Honesty Statement + +In this course we take Academic Honesty EXTREMELY seriously. Read the +statement in `academic_honesty.txt` using your favorite text editor or +using the following command (type `man cat` for more information on +this tool). + +From your repository's root directory type: +
+$ cat hw0/academic_honesty.txt
+
+ +Next, we will append the Academic Honesty Statement into your +repository's README along with the date and your "signature" +confirming that you have read the statement and agree with the policy +and commit it to your repo. + +From your repository's root directory type the following commands into +your terminal, filling in `YOUR_NAME` with the appropriate information +in the second command. + +> :nerd: The second "crazy" command is an example of redirection which + can be done between programs on the command-line. We will learn more + about redirection and how it works later in the semester. + +
+$ cd hw0
+$ cat academic_honesty.txt <(echo "$(date -u) - YOUR_NAME") >> ../README.md
+$ git add ../README.md
+$ git commit -m "Academic Honesty statement"
+$ git push
+
+ +#### CI File + +This semester we want to ensure that students don't get caught by +silly mistakes or overlook anything while working on their +assignments. We will use GitLab's [_Continuous +Integration_](https://en.wikipedia.org/wiki/Continuous_integration) +feature to minimize such incidents. It is an automated tool that will +make sure your work compiles and passes basic tests. + +There is a `.gitlab-ci.yml` file in the base code. This file is used +to set up a clean vm, compile your code, run your unit tests, and +lastly run your program on the gitlab server. When looking on gitlab +you will notice a red :x: or green :heavy_check_mark:. Each represents +the result of a 'CI pipeline'. Go to the Pipelines tab to view the +results of your run. We will provide this file with each homework. The +CI runs when gitlab "gets around" to doing it so don't be alarmed if +you don't see your result right away. Also, note that sometimes +a "Runner System Failure" might occur due to problems on the server. +A failure reported by the CI system does *not* mean that "your commit failed" +in the sense that what you committed and pushed failed to make it to the +gitlab server; it means that an error occurred while the system was trying +to compile and run what you committed. You can always use the `Repository` +tab of the gitlab web interface to see what commits you have pushed to the +server. If a commit is shown there, then it is safely on the server. + +#### Hello, World! + +In the terminal, navigate into the `hw0` folder of your repository +(i.e. `~/REPO_NAME/hw0/`). Open the file `include/hi.h` and examine its +contents, read the comments, and follow the directions in the +files. These directions step you through the base files in the HW to +familiarize you with the basic file structure and the included code +components. + +In the terminal, type `make` to build the `hw0` base code. This will +compile your C code into executables. + +
+$ make
+mkdir -p bin build
+gcc build/hi.o build/main.o -o bin/hi
+gcc -Wall -Werror -std=gnu11 -g -DDEBUG -I include build/hi.o tests/test.c -lcriterion -o bin/hi_tests
+
+ +An executable named `hi` and `hi_tests` will be created in the `bin` +folder of the `hw0` directory. You can execute either program by +typing `bin/hi` or `bin/hi_tests` from the `hw0` directory into the +terminal. + +Running `bin/hi` will print "Hello, World!" before exiting. Running +`bin/hi_tests` will fail a unit test and print the warning `"Assertion +failed: say_hi() function did not say 'Hi'"`. + +
+$ bin/hi
+Hello, World!
+$ bin/hi_tests
+[----] tests/test.c:15: Assertion failed: say_hi() function did not say 'Hi'
+[FAIL] CSE320_Suite::test_it_really_does_say_hi: (0.00s)
+[====] Synthesis: Tested: 1 | Passing: 0 | Failing: 1 | Crashing: 0
+
+ +To do this assignment, modify the `say_hi()` function in `src/hi.c` to +satisfy the unit test (i.e. `return "Hi"`). This will now make the +program do what the unit test expects and as a result pass the unit +test. + +Rebuild the hw0 executables by typing `make` into your terminal. Run +the program again to make sure it satisfies the requirements. + +
+$ make
+mkdir -p bin build
+gcc -Wall -Werror -std=gnu11 -g -DDEBUG -I include -c -o build/hi.o src/hi.c
+gcc build/hi.o build/main.o -o bin/hi
+gcc -Wall -Werror -std=gnu11 -g -DDEBUG -I include build/hi.o tests/test.c -lcriterion -o bin/hi_tests
+$ bin/hi
+Hi, World!
+$ bin/hi_tests
+[====] Synthesis: Tested: 1 | Passing: 1 | Failing: 0 | Crashing: 0
+
+ +To save your code changes, `add` them to the staging area of +git. `Commit` the changes to a local commit on your VM. Then `push` +the commits to the remote server (gitlab). + +You can be sure that everything compiles correctly if at some point a +check appears next to your repo on gitlab. + +
+$ git status
+On branch master
+Your branch is up-to-date with 'origin/master'.
+Changes not staged for commit:
+  (use "git add file..." to update what will be committed)
+  (use "git checkout -- file..." to discard changes in working directory)
+
+	modified:   src/hi.c
+
+no changes added to commit (use "git add" and/or "git commit -a")
+$ git add src/hi.c
+$ git commit -m "Hi Fix"
+[master XXXXXXX] Hi Fix
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+$ git push
+Username for 'https://gitlab02.cs.stonybrook.edu': REPO_NAME
+Password for 'https://REPO_NAME@gitlab02.cs.stonybrook.edu':
+
+ +## How to submit with `git submit` + +This semester you will be submitting all assignments using our custom +git command: `git submit` + +The usage for `git submit` is: + +
+$ git submit [-c COMMIT_HASH] TAG
+
+ +The **TAG** names the assignment you are tagging, for example: `hw0` or +`hw1` or `hw2`. This is the format for all tags (with a few +exceptions) that we will use this semester: `hw#` where `#` is the +assignment number. + +The `-c` flag is optional. **COMMIT_HASH** is the SHA of the commit you +wish to submit. In case you wanted to submit a different commit than +your most current one you would just provide the SHA for the commit to +be submitted. You can view your commit SHAs using the following +command: + +> The SHA is the alphanumeric string before the first hyphen. + +
+$ git log --pretty=format:"%h - %s - %ar"
+
+ +With `git submit`, you may: + +* Submit your assignment **only** from the master branch. + +* The commits that you are submitting must be on the master branch or + merged into master (commit hash is on master). + +* You may use other branches if you wish, but you cannot use + git-submit from these branches. + +
+$ git submit hw0
+
+ +This will submit your latest pushed commit for `hw0`. You can submit +as many times as you wish prior to any deadline. + +#### How to check your submission + +It creates a special branch in your repo called 'submit'. The only +reason it is special is that only instructors have permission to push +to it, hence why you need a special tool to submit your assignment. So +the submit tool tags the commit that you want to submit and merges +that commit into the submit branch under an authorized user. + +Also, you should see a submission commit on the 'submit' branch. The +submission commit is a commit with a commit message formatted as +" submission commit". This you can see by navigating to +Repository > Commits and selecting the 'submit' branch from the +dropdown. + +If you see both of these things, you have successfully submitted the +assignment. A successfully submitted homework assignment will have a +tag for that particular homework which you can see if you go to your +project in Gitlab and navigate to Repository > Tags. + +--------------------------------------------- + +These are the tools and the dev environment you will be working with +for the rest of the semester. We have provided you with these tools to +ease the protocols of this course and prevent mishaps from +occurring. + +So that you do not delay your ability to start on `hw1`, you should make +every effort to complete the steps above by **February 4, 2022**, 11:59PM. +After that date it will no longer be possible to `git submit hw0`, +however you should still make the commits to your repository described +above prior to submitting `hw1`. +Although `hw0` will not be formally scored and weighted into the final grade +calculations, you will not receive credit for any other assignments +in this course unless the signed Academic Honesty statement has previously +been committed to your repository and thereby incorporated into the +submissions for those assignments. 
diff --git a/hw1-doc/ECMA-404_2nd_edition_december_2017.pdf b/hw1-doc/ECMA-404_2nd_edition_december_2017.pdf new file mode 100644 index 0000000..c69ce49 Binary files /dev/null and b/hw1-doc/ECMA-404_2nd_edition_december_2017.pdf differ diff --git a/hw1-doc/README.md b/hw1-doc/README.md new file mode 100644 index 0000000..026350f --- /dev/null +++ b/hw1-doc/README.md @@ -0,0 +1,1196 @@ +# Homework 1 - CSE 320 - Spring 2022 +#### Professor Eugene Stark + +### **Due Date: Friday 02/18/2022 @ 11:59pm** + +**Read the entire doc before you start** + +## Introduction + +In this assignment, you will implement functions for parsing JSON input +and building a data structure to represent its contents and for traversing +the data structure and producing JSON output. +You will use these functions to implement a command-line utility +(called `argo`) +which can validate JSON input and transform JSON input into JSON output +in a "canonical" form. +The goal of this homework is to familiarize yourself with C programming, +with a focus on input/output, bitwise manipulations, and the use of pointers. + +For all assignments in this course, you **MUST NOT** put any of the functions +that you write into the `main.c` file. The file `main.c` **MUST ONLY** contain +`#include`s, local `#define`s and the `main` function (you may of course modify +the `main` function body). The reason for this restriction has to do with our +use of the Criterion library to test your code. +Beyond this, you may have as many or as few additional `.c` files in the `src` +directory as you wish. Also, you may declare as many or as few headers as you wish. +Note, however, that header and `.c` files distributed with the assignment base code +often contain a comment at the beginning which states that they are not to be +modified. **PLEASE** take note of these comments and do not modify any such files, +as they will be replaced by the original versions during grading. 
+ +> :scream: Array indexing (**'A[]'**) is not allowed in this assignment. You +> **MUST USE** pointer arithmetic instead. All necessary arrays are declared in +> the `global.h` header file. You **MUST USE** these arrays. **DO NOT** create +> your own arrays. We **WILL** check for this. + +> :nerd: Reference for pointers: [https://beej.us/guide/bgc/html/#pointers](https://beej.us/guide/bgc/html/#pointers). + +# Getting Started + +Fetch base code for `hw1` as described in `hw0`. You can find it at this link: +[https://gitlab02.cs.stonybrook.edu/cse320/hw1](https://gitlab02.cs.stonybrook.edu/cse320/hw1). +**IMPORTANT: 'FETCH', DO NOT 'CLONE'.** + +Both repos will probably have a file named `.gitlab-ci.yml` with different contents. +Simply merging these files will cause a merge conflict. To avoid this, we will +merge the repos using a flag so that the `.gitlab-ci.yml` found in the `hw1` +repo will replace the `hw0` version. To merge, use this command: + +``` +git merge -m "Merging HW1_CODE" HW1_CODE/master --strategy-option=theirs +``` + +> :scream: Based on past experience, many students will either ignore the above command or forget +> to use it. The result will be a **merge conflict**, which will be reported by git. +> Once a merge conflict has been reported, it is essential to correct it before committing +> (or to abort the merge without committing -- use `git merge --abort` and go back and try again), +> because git will have inserted markers into the files involved indicating the locations of the +> conflicts, and if you ignore this and commit anyway, you will end up with corrupted files. +> You should consider it important to read up at an early stage on merge conflicts with git and +> how to resolve them properly. + +Here is the structure of the base code: + +
+.
+├── .gitlab-ci.yml
+└── hw1
+    ├── .gitignore
+    ├── hw1.sublime-project
+    ├── include
+    │   ├── argo.h
+    │   ├── debug.h
+    │   └── global.h
+    ├── lib
+    │   └── argo.a
+    ├── Makefile
+    ├── rsrc
+    │   ├── numbers.json
+    │   ├── package-lock.json
+    │   └── strings.json
+    ├── src
+    │   ├── argo.c
+    │   ├── const.c
+    │   ├── main.c
+    │   └── validargs.c
+    ├── test_output
+    │   └── .git_keep
+    └── tests
+        ├── basecode_tests.c
+        └── rsrc
+            └── strings_-c.json
+
+ +- The `.gitlab-ci.yml` file is a file that specifies "continuous integration" testing +to be performed by the GitLab server each time you push a commit. Usually it will +be configured to check that your code builds and runs, and that any provided unit tests +are passed. You are free to change this file if you like. + +> :scream: The CI testing is for your own information; it does not directly have +> anything to do with assignment grading or whether your commit has been properly +> pushed to the server. If some part of the testing fails, you will see the somewhat +> misleading message "commit failed" on the GitLab web interface. +> This does **not** mean that "your attempt to commit has failed" or that "your commit +> didn't get pushed to the server"; the very fact that the testing was triggered at +> all means that you successfully pushed a commit. Rather, it means that "the CI tests +> performed on a commit that you pushed did not succeed". The purpose of the tests are +> to alert you to possible problems with your code; if you see that testing has failed +> it is worth investigating why that has happened. However, the tests can sometimes +> fail for reasons that are not your fault; for example, the entire CI "runner" system +> may fail if someone submits code that fills up the system disk. You should definitely +> try to understand why the tests have failed if they do, but it is not necessary to be +> overly obsessive about them. + +- The `hw1.sublime-project` file is a "project file" for use by the Sublime Text editor. +It is included to try to help Sublime understand the organization of the project so that +it can properly identify errors as you edit your code. + +- The `Makefile` is a configuration file for the `make` build utility, which is what +you should use to compile your code. 
In brief, `make` or `make all` will compile +anything that needs to be, `make debug` does the same except that it compiles the code +with options suitable for debugging, and `make clean` removes files that resulted from +a previous compilation. These "targets" can be combined; for example, you would use +`make clean debug` to ensure a complete clean and rebuild of everything for debugging. + +- The `include` directory contains C header files (with extension `.h`) that are used +by the code. Note that these files often contain `DO NOT MODIFY` instructions at the beginning. +You should observe these notices carefully where they appear. + +- The `src` directory contains C source files (with extension `.c`). + +- The `tests` directory contains C source code (and sometimes headers and other files) +that are used by the Criterion tests. + +- The `rsrc` directory contains some samples of data files that you can use for +testing purposes. + + - The `test_output` directory is a scratch directory where the Criterion tests can +put output files. You should not commit any files in this directory to your +`git` repository. + +- The `lib` directory contains a library with binaries for my functions +`argo_read_value()` and `argo_write_value()`. As discussed below, by commenting out +the stubs for these functions in `argo.c` you can arrange for my versions to be +linked with your code, which may help you to get a jump start on understanding +some things. + +## A Note about Program Output + +What a program does and does not print is VERY important. +In the UNIX world stringing together programs with piping and scripting is +commonplace. Although combining programs in this way is extremely powerful, it +means that each program must not print extraneous output. For example, you would +expect `ls` to output a list of files in a directory and nothing else. +Similarly, your program must follow the specifications for normal operation. 
+One part of our grading of this assignment will be to check whether your program +produces EXACTLY the specified output. If your program produces output that deviates +from the specifications, even in a minor way, or if it produces extraneous output +that was not part of the specifications, it will adversely impact your grade +in a significant way, so pay close attention. + +> :scream: Use the debug macro `debug` (described in the 320 reference document in the +> Piazza resources section) for any other program output or messages you may need +> while coding (e.g. debugging output). + +# Part 1: Program Operation and Argument Validation + +In this part of the assignment, you will write a function to validate the arguments +passed to your program via the command line. Your program will treat arguments +as follows: + +- If no flags are provided, you will display the usage and return with an +`EXIT_FAILURE` return code. + +- If the `-h` flag is provided, you will display the usage for the program and +  exit with an `EXIT_SUCCESS` return code. + +- If the `-v` flag is provided, then the program will read data from standard input +(`stdin`) and validate that it is syntactically correct JSON. If so, the program +exits with an `EXIT_SUCCESS` return code, otherwise the program exits with an +`EXIT_FAILURE` return code. In the latter case, the program will print to +standard error (`stderr`) an error message describing the error that was discovered. +No other output is produced. + +- If the `-c` flag is provided, then the program performs the same function as +described for `-v`, but after validating the input, the program will also output +to standard output (`stdout`) a "canonicalized" version of the input. +"Canonicalized" means that the output is in a standard form in which possibilities +for variation have been eliminated. This is described in more detail below.
+Unless `-p` has also been specified, the produced output contains **no whitespace** +(except within strings that contain whitespace characters). + +- If the `-p` flag is provided, then the `-c` flag must also have been provided. +In this case, newlines and spaces are used to format the canonicalized output +in a more human-friendly way. See below for the precise requirements on where +this whitespace must appear. The `INDENT` is an optional nonnegative integer argument +that specifies the number of additional spaces to be output at the beginning of a line +for each increase in indentation level. The format of this argument must be +the same as for a nonnegative integer number in the JSON specification. +If `-p` is provided without any `INDENT`, then a default value of 4 is used. + +Note that the program reads data from `stdin` and writes transformed data +to `stdout`. Any other printout, such as diagnostic messages produced by the +program, is written to `stderr`. If the program runs without error, then it +will exit with the `EXIT_SUCCESS` status code; if any error occurs during the +execution of the program, then it will exit with the `EXIT_FAILURE` status code. + +> :nerd: `EXIT_SUCCESS` and `EXIT_FAILURE` are macros defined in `<stdlib.h>` which +> represent success and failure return codes respectively. + +> :nerd: `stdin`, `stdout`, and `stderr` are special I/O "streams", defined +> in `<stdio.h>`, which are automatically opened at the start of execution +> for all programs, do not need to be reopened, and (almost always) should not +> be closed. + +The usage scenarios for this program are described by the following message, +which is printed by the program when it is invoked without any arguments: + +
+USAGE: bin/argo [-h] [-c|-v] [-p|-p INDENT]
+   -h       Help: displays this help menu.
+   -v       Validate: the program reads from standard input and checks whether
+            it is syntactically correct JSON.  If there is any error, then a message
+            describing the error is printed to standard error before termination.
+            No other output is produced.
+   -c       Canonicalize: once the input has been read and validated, it is
+            re-emitted to standard output in 'canonical form'.  Unless -p has been
+            specified, the canonicalized output contains no whitespace (except within
+            strings that contain whitespace characters).
+   -p       Pretty-print:  This option is only permissible if -c has also been specified.
+            In that case, newlines and spaces are used to format the canonical output
+            in a more human-friendly way.  For the precise requirements on where this
+            whitespace must appear, see the assignment handout.
+            The INDENT is an optional nonnegative integer argument that specifies the
+            number of additional spaces to be output at the beginning of a line for each
+            increase in indentation level.  If no value is specified, then a
+            default value of 4 is used.
+
+ +The square brackets indicate that the enclosed argument is optional. +The `-c|-v` means that one of `-c` or `-v` may be specified. +The `-p|-p INDENT` means that `-p` may be specified alone, or with an optional +additional argument `INDENT`. + +A valid invocation of the program implies that the following hold about +the command-line arguments: + +- All "positional arguments" (`-h`, `-c`, or `-v`) come before any optional arguments +(`-p`). +The optional arguments (well, there is only one) may come in any order after the positional ones. + +- If the `-h` flag is provided, it is the first positional argument after +the program name and any other arguments that follow are ignored. + +- If the `-h` flag is *not* specified, then exactly one of `-v` or `-c` +must be specified. + +- If `-p` is given, then it might or might not be followed by an `INDENT` argument. + If the `INDENT` argument is present, then it must represent a nonnegative integer + in the format allowed for integer numbers in the JSON specification. + +For example, the following are a subset of the possible valid argument +combinations: + +- `$ bin/argo -h ...` +- `$ bin/argo -v` +- `$ bin/argo -c -p` +- `$ bin/argo -c -p 8` + +> :scream: The `...` means that all arguments, if any, are to be ignored; e.g. +> the usage `bin/argo -h -x -y BLAHBLAHBLAH -z` is equivalent to `bin/argo -h`. + +Some examples of invalid combinations would be: + +- `$ bin/argo -p 1 -c` +- `$ bin/argo -v -c` +- `$ bin/argo -v -p 5` +- `$ bin/argo -z 20` + +> :scream: You may use only "raw" `argc` and `argv` for argument parsing and +> validation. Using any libraries that parse command line arguments (e.g. +> `getopt`) is prohibited. + +> :scream: Any libraries that help you parse strings are prohibited as well +> (`string.h`, `ctype.h`, etc). The use of `atoi`, `scanf`, `fscanf`, `sscanf`, +> and similar functions is likewise prohibited. 
*This is intentional and +> will help you practice parsing strings and manipulating pointers.* + +> :scream: You **MAY NOT** use dynamic memory allocation in this assignment +> (i.e. `malloc`, `realloc`, `calloc`, `mmap`, etc.). There is one function +> (`argo_append_char()`) provided for you that does the dynamic allocation +> required while accumulating the characters of a string or numeric literal. +> This function is in the file `const.c`, which you are not to modify. + +> :nerd: Reference for command line arguments: [https://beej.us/guide/bgc/html/#command-line-arguments](https://beej.us/guide/bgc/html/#command-line-arguments). + +**NOTE:** The `make` command compiles the `argo` executable into the `bin` folder. +All commands from here on are assumed to be run from the `hw1` directory. + +### **Required** Validate Arguments Function + +In `global.h`, you will find the following function prototype (function +declaration) already declared for you. You **MUST** implement this function +as part of the assignment. + +```c +int validargs(int argc, char **argv); +``` + +The file `validargs.c` contains the following specification of the required behavior +of this function: + +```c +/** + * @brief Validates command line arguments passed to the program. + * @details This function will validate all the arguments passed to the + * program, returning 0 if validation succeeds and -1 if validation fails. + * Upon successful return, the various options that were specified will be + * encoded in the global variable 'global_options', where it will be + * accessible elsewhere in the program. For details of the required + * encoding, see the assignment handout. + * + * @param argc The number of arguments passed to the program from the CLI. + * @param argv The argument strings passed to the program from the CLI. + * @return 0 if validation succeeds and -1 if validation fails. 
+ * @modifies global variable "global_options" to contain an encoded representation + * of the selected program options. + */ +``` + +> :scream: This function must be implemented as specified as it will be tested +> and graded independently. **It should always return -- the USAGE macro should +> never be called from validargs.** + +The `validargs` function should return -1 if there is any form of failure. +This includes, but is not limited to: + +- Invalid number of arguments (too few or too many). + +- Invalid ordering of arguments. + +- A missing parameter to an option that requires one [doesn't apply to the +  current assignment, since the parameter to `-p` is optional]. + +- Invalid parameter. A numeric parameter specified with `-p` is invalid if +  it does not conform to the format of a nonnegative integer as required by +  the JSON specification. + +The `global_options` variable of type `int` is used to record the mode +of operation (i.e. validate/canonicalize) of the program and associated parameters. +This is done as follows: + +- If the `-h` flag is specified, the most significant bit (bit 31) is 1. + +- If the `-v` flag is specified, the second-most significant bit (bit 30) +  is 1. + +- If the `-c` flag is specified, the third-most significant bit (bit 29) +  is 1. + +- If the `-p` flag is specified, the fourth-most significant bit (bit 28) +  is 1. + +- The least significant byte (bits 7 - 0) records the number of spaces of +  indentation per level specified with `-p`, or the default value (4) +  if no value was specified with `-p`. If `-p` was not specified at all, +  then this byte should be 0. + +If `validargs` returns -1 indicating failure, your program must call +`USAGE(program_name, return_code)` and return `EXIT_FAILURE`. +**Once again, `validargs` must always return, and therefore it must not +call the `USAGE(program_name, return_code)` macro itself. +That should be done in `main`.** + +If `validargs` sets the most-significant bit of `global_options` to 1 +(i.e.
the `-h` flag was passed), your program must call `USAGE(program_name, return_code)` +and return `EXIT_SUCCESS`. + +> :nerd: The `USAGE(program_name, return_code)` macro is already defined for you +> in `argo.h`. + +If `validargs` returns 0, then your program must read input data from `stdin` +and (depending on the options supplied) write output data to `stdout`. +Upon successful completion, your program should exit with exit status `EXIT_SUCCESS`; +otherwise, in case of an error it should exit with exit status `EXIT_FAILURE`. + +Unless the program has been compiled for debugging (using `make debug`), +in a successful run that exits with `EXIT_SUCCESS` no other output may be produced +by the program. In an unsuccessful run in which the program exits with `EXIT_FAILURE` +the program should output to `stderr` a one-line diagnostic message that indicates +the reason for the failure. The program must not produce any other output than this +unless it has been compiled for debugging. + +> :nerd: Remember `EXIT_SUCCESS` and `EXIT_FAILURE` are defined in `<stdlib.h>`. +> Also note, `EXIT_SUCCESS` is 0 and `EXIT_FAILURE` is 1. + +### Example validargs Executions + +The following are examples of the setting of `global_options` and the +other global variables for various command-line inputs. +Each input is a bash command that can be used to invoke the program. + +- **Input:** `bin/argo -h`. **Setting:** `global_options=0x80000000` +(`help` bit is set, other bits clear). + +- **Input:** `bin/argo -v`. **Setting:** `global_options=0x40000000` +(mode is "validate"). + +- **Input:** `bin/argo -c -p 2`. **Setting:** `global_options=0x30000002` +(mode is "canonicalize", "pretty-print" has been specified with +indentation increment 2). + +- **Input:** `bin/argo -p 2 -c`. **Setting:** `global_options=0x0`. +This is an error case because the specified argument ordering is invalid +(`-p` is before `-c`). In this case `validargs` returns -1, leaving +`global_options` unset.
+ +# Part 2: Overview of the JSON Specification + +JSON ("JavaScript Object notation") is a standard format for data interchange +that is now commonly used in many areas of computing. +It was designed to be extremely simple to generate and parse and it in fact +achieves these goals: JSON syntax is about as simple as it gets for a +computer language that is actually used in the real world. +The syntax of JSON is defined by an +[ECMA standard](ECMA-404_2nd_edition_december_2017.pdf). +A summary that omits the scarier language from the standard document is given at +[www.json.org](https://www.json.org/json-en.html). +Most likely, you will only need to refer to this summary, but the full standard +document is here if you want to look at it. + +In order to understand the JSON syntax specification, you need to be able to +read the "railroad diagrams" that are used to formally specify it. +These diagrams are actually a graphical version of a *context-free grammar*, +which is a standard tool used for formally specifying all kinds of computer +languages. Actually, the white box inset on the right contains the full +grammar; the railroad diagrams only describe the portion of the syntax that +has any significant complexity. +Each of the railroad diagrams defines the syntax of a particular +"syntactic category", which is a set of strings having a similar format. +Examples of syntactic categories for JSON are "object", "array", +"value", "number", *etc*. +The paths along the "railroad tracks" in the diagram for one syntactic category +indicate the possibilities for forming a string in that category from strings +in other categories. +For example, the first diagram says that a string in the category "object" +always has an initial curly bracket `{`. This may be followed immediately by +a closing curly bracket `}` (the top "track"), or between the brackets there +may be something more complicated (a list of "members" -- the lower "track"). 
+By following the lower track, you find that there has to be "whitespace", +followed by a "string", followed by "whitespace", followed by a colon `:`, +followed by a "value". After the "value", it is possible to have the +closing curly bracket `}` or to loop back around and have another instance of +the same pattern that was just seen (a "member"). The path to loop back around +requires that a comma `,` appear before the next member, so this tells you +that the members in between the `{` and `}` are separated by commas. +The other diagrams are read similarly, and even if you have never seen these +before, with a little study they should be self-explanatory so I'm not going +to belabor the explanation further. + +Something that was not initially clear to me from just looking at the diagrams +was what the syntax of `true`, `false`, and `null` is. These are shown with +double quotes in the inset box on the right, but in fact, the "token" `true` +simply consists of the four letters: `t`, `r`, `u`, `e` without any quotes. +This is spelled out better in the ECMA standard document. + +The description of "character" in the inset box is also a bit mysterious +at first reading. A "character" is something that is permitted to occur within +a string literal. After staring at the description for awhile it becomes clear +that any Unicode code point except for (1) the "control characters" +whose code points range from U+0000 to U+001F, (2) the quote `"`, +and (3) the backslash '\' (they call it "reverse solidus"), may appear directly +representing themselves within a string literal. +In addition, "escape sequences" are permitted. An escape sequence starts +with a backslash `\`, which may be followed by one of the characters +`"`, '\\', '/', 'b', 'f', 'n', 'r', 't', or 'u'. After 'u' there are required +to appear exactly four hexadecimal digits, the letters of which may either +be in upper case or lower case. 
The meaning of `\"`, `\\`, `\b`, `\f`, +`\n`, `\r`, and `\t` is as in a C string. The escape sequence `\/` represents +a single forward slash ("solidus") `/` (I do not know why this is in the +standard.) The meaning of `\uHHHH`, where `HHHH` are four hex digits, is +the 16-bit Unicode code point from the "basic multilingual plane" whose +value is given by interpreting `HHHH` as a four-digit hexadecimal number. + +Although a Unicode code point outside the basic multilingual plane may +occur directly in a string literal, representing such by an escape requires +the use of a "surrogate pair" as indicated in the ECMA standard document. +Don't worry about this technicality. For this assignment, your implementation +will not have to handle full Unicode and UTF-8-encoded input. +You may assume instead that the input to your program will come as a sequence of +8-bit bytes, each of which directly represents a Unicode code point in the +range U+0000 to U+00FF (the first 128 code points correspond to ASCII codes, +and the meaning of the next 128 code points is defined by the Unicode standard). +Note that this means that we are *not* using the usual UTF-8 encoding to +represent Unicode as a sequence of 8-bit bytes. +As you will see when you look at the definitions of the data structures you +are to use, internally your program will use the 32-bit `int` +(typedef'ed as `ARGO_CHAR`) to represent a character. +This is enough bits to represent any Unicode code point, so there will +be no problem in taking the input bytes that you read in and storing them +internally as Unicode code points. Due to the limitation of the input encoding, +for us a string literal will not be able to directly contain any Unicode +code point greater than U+00FF.
+Nevertheless, you will still be able to use escape sequences within +a string literal to represent Unicode code points in the basic multilingual +plane (from U+0000 to U+FFFF), because the escape sequence allows you +to specify the code point directly as four hexadecimal digits. +Since we will also output JSON as a sequence of 8-bit bytes, it will be +necessary to render any Unicode code points greater than U+00FF occurring +in a string literal using escapes. + +When reading a specification like this, it is helpful to have examples of +what is being defined. For this purpose, I have provided (in the `rsrc` +directory) some sample JSON files. These files all have the `.json` +extension. Some of these files are examples of what your program is supposed +to do when given other files as input. For example, the file `rsrc/numbers.json` +contains the following content: + +``` +{ + "0": 0, + "2147483648": 2147483648, + "-2147483649": 2147483649, + "0.0": 0.0, + "1": 1, + "789": 789, + "1.0": 1.0, + "-1.0": -1.0, + "1e3": 1e3, + "1E3": 1E3, + "1e-3": 1e-3, + "1.234": 1.234, + "-1.234": -1.234, + "1.234e3": 1.234e3, + "1.234e-3": 1.234e-3 +} +``` + +When your program is run as follows + +``` +$ bin/argo -c -p 2 < rsrc/numbers.json +``` + +it should produce the output in `rsrc/numbers_-c_-p_2.json`; namely + +``` +{ + "0": 0, + "2147483648": 2147483648, + "-2147483649": 2147483649, + "0.0": 0.0, + "1": 1, + "789": 789, + "1.0": 0.1e1, + "-1.0": -0.1e1, + "1e3": 0.1e4, + "1E3": 0.1e4, + "1e-3": 0.1000000000000000e-2, + "1.234": 0.1233999999999999e1, + "-1.234": -0.1233999999999999e1, + "1.234e3": 0.1233999999999999e4, + "1.234e-3": 0.1234000000000000e-2 +} +``` + +How this is supposed to happen is explained below.
+ +# Part 3: Implementation + +The header file `global.h` lists prototypes for functions you are +required to implement: + +```c +ARGO_VALUE *argo_read_value(FILE *); +int argo_read_string(ARGO_STRING *s, FILE *); +int argo_read_number(ARGO_NUMBER *n, FILE *); + +int argo_write_value(ARGO_VALUE *, FILE *); +int argo_write_string(ARGO_STRING *, FILE *); +int argo_write_number(ARGO_NUMBER *, FILE *); + +int validargs(int argc, char **argv); +``` + +The `validargs()` function has already been discussed above. +The `argo_read_value()` function reads JSON input from the specified stream +and returns an `ARGO_VALUE` data structure (as described below). +The `argo_read_string()` function takes a pointer to an `ARGO_STRING` +structure (which will be a sub-structure of an `ARGO_VALUE` structure), +as well as a `FILE *` pointer, and it reads a JSON string literal +(starting and ending with a quote `"`) from the input stream and stores +the content of the string (without the quotes, after handling escapes) +in the specified `ARGO_STRING` object. +The `argo_read_number()` function works similarly, except it reads +a JSON numeric literal and uses it to initialize an `ARGO_NUMBER` +structure. + +The `argo_write_value()` function takes an `ARGO_VALUE` data structure +and a `FILE *` pointer representing an output stream, and it writes +canonical JSON representing the specified value to the output stream. +The `argo_write_string()` function takes an `ARGO_STRING *` pointer +and a `FILE *` pointer and writes a JSON string literal to the output +stream (including quotes and escaping content that needs to be escaped). +The `argo_write_number()` function similarly takes an `ARGO_NUMBER *` +pointer and a `FILE *` pointer and it writes a JSON numeric literal +to the output stream. 
+ +> :scream: Even though your final application will only ever read JSON input +> from `stdin` and write JSON output to `stdout`, the interfaces of these +> functions are designed to accept arbitrary streams as parameters. +> **You must not ignore these parameters.** Also, you must not assume that +> these streams are "seekable" and consequently you may not use the functions +> `fseek()` or `ftell()` in your code. + +Besides the general discussion below, more detailed specifications for the +required behavior of these functions are given in the comments preceding +the (non-functional) stubs in `argo.c`. Those specifications are mostly +not repeated here to avoid redundancy and possible inconsistencies between +this document and the specifications in `argo.c`. + +Of course, you will also have to make modifications to the `main()` function, +so that after calling `validargs()` it makes the calls to +`argo_read_value()` and `argo_write_value()` to perform the functions required +of the complete application. + +Since I want everybody to get the experience of designing and coding their +own implementation for this assignment, I have not spelled out any further +what other functions you might need to implement, but you will almost certainly +want to implement other functions. Note that the function interfaces +that have been specified, together with the problems that have to be solved +by these functions, give you clues about an implementation structure that +you might wish to consider. I will now discuss this briefly. + +The `argo_read_value()` function is supposed to read bytes of data from a +stream and attempt to *parse* them as a JSON "value" (which could be +an object, array, string, number, or one of the basic tokens `true`, +`false` or `null`). The result of this parsing process is a data structure +that represents the structure of the JSON in a form that is useful for +further processing.
The specification of the syntax has a recursive +structure (*e.g.* an object contains members, members contain elements, which +can themselves contain values, and so on). A simple way to parse a string +according to a recursive specification like this is via a so-called +*recursive descent* parser. Basically, the parser will have a function +for each of the syntactic categories that appear in the syntax specification +(`argo_read_value()` is one such function). Each of these functions will +be called at a point where what is expected on the input stream is a string +belonging to the syntactic category handled by that function. +The function will read one or more characters from the input stream and, based +on what it sees, it will recursively call one or more of the other parser +functions. For example, the function responsible for parsing an "object" +might check that the next character in the input is a curly brace `{` +and then call the function responsible for parsing a "member". +Each parsing function will return a data structure that represents what it +has parsed. To build this data structure, each parsing function will +typically need to make use of the data structures returned by the functions +that it called recursively. + +In general, each function in a recursive descent parser will need to examine +a certain amount of the input in order to determine what to do. This input +is called "look-ahead". One of the features of the JSON syntax that makes +it so easy to parse is that at most one character of look-ahead is ever +required in order to decide what to do next. For example, once we have +seen the `{` that starts an object, checking whether the next character is +a `}` or not is sufficient to tell whether we have to call functions +to parse members of the object, or whether the object is empty. +In implementing a parser like this, it generally simplifies the design +if you can "peek" at the look-ahead character without consuming it.
+That way, when you call another function, it can assume that the input +stream is at the very start of what it is supposed to be trying to parse, +rather than having to keep track of what characters might already have +been read by the caller. +You should use the `fgetc()` function from the C standard I/O library +to read each byte of data from the input stream. This function consumes +the byte of data from the input stream, but the standard I/O library +also provides a function `ungetc()` that allows you to "push back" a single +character of input. So you can achieve the effect of peeking one character +into the input stream by calling `fgetc()`, looking at the character returned, +and then using `ungetc()` to push it back into the stream if it is not +to be consumed immediately. In some cases, as you descend through recursive +calls, the same character might be examined and pushed back repeatedly. + +The recursive structure also dictates a natural form for the implementation +of the output function `argo_write_value()`: you can have one function +for each meaningful entity (*e.g.* "object", "member", "number") in the +JSON specification and these functions will call each other recursively +in order to traverse the data structure and emit characters to the output +stream. + +# Part 4: Data Structures + +The `argo.h` header file gives C definitions for the data structures you are to +produce as the return values from `argo_read_value()` and as the arguments +to `argo_write_value()`. These data structures are basically trees. +The `ARGO_VALUE` structure is the central definition, which spells out what +information is in a node of such a tree. As the same `ARGO_VALUE` structure +is used to represent all the types of JSON values ("object", "array", "number", +*etc.*) it has a `type` field to indicate specifically what type of object +each individual instance represents. The possible types are defined by the +`ARGO_VALUE_TYPE` enumeration.
Each node also has a `content` field, which +is where the actual content of the node is stored. The `content` field +is defined using the C `union` type, which allows the same region of memory +to be used to store different types of things at different times. +Depending on what is in the `type` field, exactly one of the `object`, `array`, +`string`, `number`, or `basic` subfields of `content` will be valid. +Except for `ARGO_BASIC`, which just defines a set of possible values, +each of these has its own structure definition, which are given as +`ARGO_OBJECT`, `ARGO_ARRAY`, `ARGO_STRING`, and `ARGO_NUMBER`. + +Besides the `type` and `content` fields, each `ARGO_VALUE` node contains +`next` and `prev` fields that point to other `ARGO_VALUES`. These fields +will be used to link each node with other "sibling" nodes into a list. +For example, a JSON "object" has a list of "members". +The JSON object will be represented by an `ARGO_VALUE` node having +`ARGO_OBJECT_TYPE` in its `type` field. The `content` field of this +object will therefore be used to hold an `ARGO_OBJECT` structure. +The `ARGO_OBJECT` structure has a single field: `member_list`, which +points to a "dummy" `ARGO_VALUE` structure used as the head of +a *circularly, doubly linked list* of members (more on this below). +Linked into this list will be `ARGO_VALUE` structures that represent +the members. The `next` and `prev` fields of these are used to chain +the members into a list: the `next` field points from a member to +the next member and the `prev` field points from a member to the previous +member. For `ARGO_VALUE` structures used to represent members of +an object, the `name` field will contain an `ARGO_STRING` structure +that represents the name of the member. + +JSON arrays are represented similarly to JSON objects: the array as a +whole is represented by an `ARGO_VALUE` structure whose `type` field +contains `ARGO_ARRAY_TYPE`. 
The `content` field will therefore be used +to hold an `ARGO_ARRAY` structure, which has an `element_list` field that +points to a "dummy" `ARGO_VALUE` structure at the head of a list of elements, +similarly to what was just described for object members. +However, array elements don't have names, so the `name` field of each +array element will just be `NULL`. + +JSON strings are represented by the `ARGO_STRING` structure, which +has fields `capacity`, `length`, and `content`. These are used to +represent a dynamically growable string, similarly to the way +`ArrayList` is implemented in Java. +At any given time, the `content` field will either be `NULL` +(if the string is empty) or it will point to an array of `ARGO_CHAR` +elements, each of which represents a single Unicode code point. +The `capacity` field tells the total number of "slots" in this +array, whereas the `length` field tells how many of these are +actually used (*i.e.* it gives the current length of the string). +For this assignment, you don't have to actually be concerned with +the dynamic allocation -- that is performed by the function +`argo_append_char()` which has been implemented for you in `const.c`. +All you have to worry about is making sure that the fields +of an `ARGO_STRING` structure that you want to use have been +initialized to zero and then you can just call `argo_append_char()` +to build the string content. +A simple `for` loop using the `length` field as the upper limit +can then be used to traverse the `content` array of an `ARGO_STRING` +once it has been initialized. + +The `ARGO_NUMBER` structure is used to represent a number. +One of its fields is a `string_value` field, which is an `ARGO_STRING` +used to hold the digits and other characters that make up the +textual representation of the number. During parsing, characters +are accumulated in this field using `argo_append_char()` in the +same way that characters are accumulated for a string value.
+The remaining fields (`int_value`, `float_value`) are used to store +an internal representation (either integer or floating-point) +of the value of the number, as well as flags (`valid_string`, +`valid_int`, `valid_float`) that tell which of the other fields +contain valid information. Note that a JSON number that contains +a fractional part or an exponent part will generally not be representable +in integer format, so the `valid_int` field should be zero and there +will be no useful information in the `int_value` field. Also, if an +`ARGO_NUMBER` is created internally, without parsing it from an input +stream, then a printable representation has not yet been computed, so the +`valid_string` field will be zero and the `string_value` field +will represent an empty string. + +To summarize, your `argo_read_value()` function will read bytes of +data from the specified input stream using `fgetc()`. +As it reads and parses the input, it will build up a tree of +`ARGO_VALUE` nodes to represent the structure of the JSON input. +The nodes of the resulting tree must satisfy the following requirements: + +- A node with `ARGO_OBJECT_TYPE` in its `type` field represents + a JSON "object". The `content` field then contains an `ARGO_OBJECT` + structure whose `member_list` field points to an `ARGO_VALUE` + node that is the head of a circular, doubly linked list of members. + Each member has a pointer to its associated name (an `ARGO_STRING`) + stored in the `name` field. + +- A node with `ARGO_ARRAY_TYPE` in its `type` field represents + a JSON "array". The `content` field then contains an `ARGO_ARRAY` + structure whose `element_list` field points to an `ARGO_VALUE` + node that is the head of a circular, doubly linked list of elements. + +- A node with `ARGO_STRING_TYPE` in its `type` field represents + a JSON "string" (without the enclosing quotes that appear in JSON + source). The `content` field then contains an `ARGO_STRING` + that represents the string. 
The `length` field of the `ARGO_STRING` + gives the length of the string and the `content` field points to + an array of `ARGO_CHAR` values that are the content of the string. + +- A node with `ARGO_NUMBER_TYPE` in its `type` field represents + a JSON "number". The `content` field then contains an `ARGO_NUMBER` + object that represents the number in various ways. + + * If the `valid_string` field is nonzero, then the `string_value` + field will contain an `ARGO_STRING` that holds the characters that + make up a printable/parseable representation of the number. + + * If the `valid_int` field is nonzero, then the `int_value` + field will contain the value of the number as a C `long`. + + * If the `valid_float` field is nonzero, then the `float_value` + field will contain the value of the number as a C `double`. + + If there is more than one representation of the number present, + then they are required to agree with each other (*i.e.* represent + the same value). + +- A node with `ARGO_BASIC_TYPE` in its `type` field will have + a `content` field having a value of type `ARGO_BASIC` in its `basic` + field. This value will be one of `ARGO_TRUE`, `ARGO_FALSE`, + or `ARGO_NULL`. + +The `argo_read_string()` function will parse a JSON string literal +and store the string content into an `ARGO_STRING` object. +Characters in the input that are not control characters and are not +one of the characters that must be escaped are simply appended directly +to the string content. However, when a backslash `\` is encountered, +it is necessary to interpret it as the start of an *escape sequence* +that represents the character to be appended. These escape sequences +should be familiar, since they are essentially the same as those +used in Java as well as in C.
+ +The `argo_read_number()` function will parse a JSON numeric literal +and store into an `ARGO_NUMBER` object not only the sequence of +characters that constitute the literal, but also the value of the +number, either in integer format, in floating point format, or both. +In order to do this, you have to actually process the various digits +one at a time and calculate (using integer and/or floating point +arithmetic) the value that is represented. You have to carry out +this conversion yourself; you are not allowed to use any library +functions to do it. + +## Circular, Doubly Linked Lists + +As already stated, object members and array elements are to be stored as +circular, doubly linked lists of `ARGO_VALUE` structures, using a "dummy" +structure as a sentinel. Even though the sentinel has the same type +(*i.e.* `ARGO_VALUE`) as the elements of the list, it does not itself represent +an element of the list. The only fields used in the sentinel are the +`next` field, which points to the first actual element of the list, +and the `prev` field, which points to the last actual element of the list. +The list is "circular" because starting at the sentinel and following +`next` pointers will eventually lead back to the sentinel again. +Similarly, starting at the sentinel and following `prev` pointers will +eventually lead back to the sentinel. An empty list is represented +by a sentinel whose `next` and `prev` pointers point back to the sentinel +itself. You can read more about this type of data structure by searching, +e.g. Wikipedia for "circularly doubly linked list". The advantage of +using the sentinel is that all insertions and deletions are performed +in exactly the same way, without any edge cases for the first or last +element of the list. + +## Dynamic Storage + +There are two types of dynamic storage used by this program. +One of these is for the content of an `ARGO_STRING`.
As already indicated +above, this is handled for you by the function `argo_append_char()` and you +do not have to worry about how it happens. +The other dynamic storage is for `ARGO_VALUE` structures. +You need a source of such structures while you are building the tree +that represents JSON. +As you are prohibited from declaring your own arrays in this +assignment, you will have to use one that we have already declared for you. +In `global.h` an array `argo_value_storage` has been defined for you, +together with an associated counter `argo_next_value`. You **must** use +this array as the source of `ARGO_VALUE` structures for building your +JSON trees. Use the `argo_next_value` counter to keep track of the +index of the first unused element of this array. +When you need an `ARGO_VALUE` structure, get a pointer to the first unused +element of the `argo_value_storage` array and increment `argo_next_value`. +Be sure to pay attention to the total number `NUM_ARGO_VALUES` of +elements of this array -- if you run off the end you will corrupt other +memory and your program will have unpredictable behavior. + +# Part 5: Canonical Output + +Your `argo_write_value()` function is supposed to traverse a data structure +such as that returned by `argo_read_value()` and it is supposed to output +JSON to the output stream. First of all, the JSON that you output has to +conform to the JSON standard, so that it can be parsed again to produce +exactly the same internal data structure. Beyond that, the JSON is supposed +to be "canonical", which means that it has been output in a standard way +that does not leave any possibility for variation. +Your canonical JSON output must always satisfy the following conditions: + +- An `ARGO_NUMBER` whose `valid_int` field is set is to be printed out as + an integer, without any fraction or exponent. + +- An `ARGO_NUMBER` whose `valid_float` field is set is to be printed out + with an integer and fractional part, as in the JSON specification. 
+ The fractional part should be normalized to lie in the interval `[0.1, 1.0)`, + so that there is always just a single `0` digit before the decimal point + and the first digit after the decimal point is always nonzero. + An exponent of 0 is to be omitted completely and for positive exponents + the `+` sign is to be omitted. Exponents always start with lower-case `e`, + rather than upper-case `E`. + +- An `ARGO_STRING` is printed so that the following conditions are satisfied: + + * Characters (other than backslash `\` and quote `"`) having Unicode code + points greater than U+001F and less than or equal to U+00FF are to appear directly + in the string literal as themselves. This includes forward slash `/`. + + * Characters with Unicode code points greater than U+00FF + are to appear as escapes using `\u` and the appropriate hex digits, + which must be in lower case. + + * Control characters that have special escapes (`\n`, `\t`, *etc.*) must + be printed using those special escapes, not using the generic escape `\u` + with hex digits. + + +If the pretty-print option has not been specified, then your canonical JSON +output must satisfy the following condition: + +- There is no white space in the output, except for white space that occurs + within a string literal. + +If the pretty-print option has been specified, then your canonical JSON output +will include white space according to the following rules: + +- A single newline is output after every `ARGO_VALUE` that is output at the + top-level (*i.e.* not as part of an object or array). + +- A single newline is output after every '{', '[', and ',' (except those in string + literals). + +- A single newline is output immediately after the last member of an object, + and immediately after the last element of an array. + +- Each newline is followed by a number of spaces that depends on the indentation + level of the value currently being printed.
The indentation level is maintained + as follows: + + * The indentation level is 0 for a top-level JSON value. + + * The indentation level is increased by one just after a `{` or `[` has + been printed to start the list of members of an object or elements of + an array. + The indentation level decreases by one just after the last member or + element has been printed, so that the closing `}` or `]` is at the + previous indentation level. + + * A single space is printed following each colon `:` that separates + the name of an object member from its value. + + The number of spaces following a newline is equal to the current indentation + level times the `INDENT` argument given with `-p`, or the default of `4` + if `-p` was specified without an `INDENT` argument. + +Note that canonicalization must be an "idempotent" operation, in the sense that +if canonical output previously produced is re-parsed and then re-output using +the same pretty-printing settings, then the new output should be identical +to the previous output. + +# Part 6: Strategy of Attack + +To make things a little easier for you in getting started on this assignment, +I have distributed with the basecode a library containing binary object +versions of my own implementations of `argo_read_value()` and `argo_write_value()`. +The `Makefile` has been constructed so that it will link your program against +the library I provided. As a result, if you comment out one or both of +these functions in `argo.c`, my versions will be linked instead and you can +use them to work on the other parts of the assignment. Note that this library +will **not** be present during grading, so do not leave these functions +commented out or your code will not compile.
This means that their behavior should be completely determined +by their specified interface, which includes their parameters, return values, +and global variables defined in `global.h` (which you may **not** modify). +There should be no implicit assumption that any other functions have been or +will be called or that any particular variables have been set to any particular +values, except for the global variables defined in `global.h`. +So, for example, you may (and should) assume that when `argo_write_value()` +is called, the `global_options` variable has been set according to the desired +program options, but you may **not** assume that some other function was +called before `argo_write_value()` was called. + +My best guess as to the right attack strategy for this assignment is as follows: +First, work on the command-line argument processing (`validargs()`) and +make the changes to `main()` necessary to get the program to honor the command-line +arguments and perform the overall function that the application is supposed +to perform. +Next, start working on implementing `argo_write_value()`, using my version of +`argo_read_value()` as a source of data structures that you can use to increase +your understanding of pointers and the specific data structures that we are using +to represent JSON and, ultimately, as an aid to developing and testing your +implementation. +Finally, now that you have a clear understanding of the data structures you +are trying to produce, work on implementing `argo_read_value()`, to parse +a stream of input bytes and produce such a data structure. I expect this part +of the assignment to be the most difficult. + +Note that the code that I wrote for `argo_read_value()` and `argo_write_value()` +is only about 800 lines in length. If you find your own code growing +much larger than that, you need to step back and think smarter about trying +to simplify your code.
+ +# Part 7: Running the Program + +The `argo` program always reads from `stdin` and possibly writes to `stdout`. +If you want the program to take input from a file or produce output to +a file, you may run the program using **input and output redirection**, +which is implemented by the shell. +A simple example of a command that uses such redirection is the following: + +``` +$ bin/argo -c < rsrc/numbers.json > numbers.out +``` + +This will cause the input to the program to be redirected from the text file +`rsrc/numbers.json` and the output from the program to be redirected to the +file `numbers.out`. +The redirection is accomplished by the shell, which interprets the `<` symbol +to mean "input redirection from a file" and the `>` symbol to mean +"output redirection to a file". It is important to understand that redirection +is handled by the shell and that the `bin/argo` program never sees any +of the redirection arguments; in the above example it sees only `bin/argo -c` +and it just reads from `stdin` and writes to `stdout`. + +Alternatively, the output from a command can be **piped** +to another program, without the use of a disk file. +This could be done, for example, by the following command: + +``` +$ bin/argo -c -p 2 < rsrc/package-lock.json | less +``` + +This sends the (rather lengthy) output to a program called `less`, +which displays the first screenful of the output and then gives you the ability +to scan forward and backward to see different parts of it. +Type `h` at the `less` prompt to get help information on what you can do +with it. Type `q` at the prompt to exit `less`. + +Programs that read from standard input and write to standard output are +often used as components in more complex "pipelines" that perform multiple +transformations on data. 
+ +For example, one way to test your implementation is by using one instance +of it to produce some output and testing to see if that output can be read by +another instance; *e.g.*: + +``` +$ cat rsrc/package-lock.json | bin/argo -c | bin/argo -c -p 2 > p.out +``` + +Here `cat` (short for "concatenate") is a command that reads the files +specified as arguments, concatenates their contents, and prints the +concatenated content to `stdout`. In the above command, this output +is redirected through a pipe to become the input to `bin/argo -c`. +The output of `bin/argo -c` (which contains no whitespace) is then +sent to `bin/argo -c -p 2` for pretty printing. Finally, the pretty-printed +output is written to file `p.out`. Actually, the original input +file `rsrc/package-lock.json` is already canonical as defined here, +so in the end the file `p.out` should have exactly the same content +as `rsrc/package-lock.json`. One way to check this is to use the +`diff` command (use `man diff` to read the manual page) to compare the +two files: + +``` +$ diff rsrc/package-lock.json p.out +$ +``` + +If `diff` exits silently, the files are identical. +Another command that would be useful on output with no whitespace +is the `cmp` command, which performs a byte-by-byte comparison of two files +(even files that contain raw binary data): + +``` +$ cmp rsrc/package-lock.json p.out +``` + +If the files have identical content, `cmp` exits silently. +If one file is shorter than the other, but the content is otherwise identical, +`cmp` will report that it has reached `EOF` on the shorter file. +Finally, if the files disagree at some point, `cmp` will report the +offset of the first byte at which the files disagree. +If the `-l` flag is given, `cmp` will report all disagreements between the +two files. 
+ +## Unit Testing + +Unit testing is a part of the development process in which small testable +sections of a program (units) are tested individually to ensure that they are +all functioning properly. This is a very common practice in industry and is +often a requested skill by companies hiring graduates. + +> :nerd: Some developers consider testing to be so important that they use a +> work flow called **test driven development**. In TDD, requirements are turned into +> failing unit tests. The goal is then to write code to make these tests pass. + +This semester, we will be using a C unit testing framework called +[Criterion](https://github.com/Snaipe/Criterion), which will give you some +exposure to unit testing. We have provided a basic set of test cases for this +assignment. + +The provided tests are in the `tests/basecode_tests.c` file. These tests do the +following: + +- `validargs_help_test` ensures that `validargs` sets the help bit +correctly when the `-h` flag is passed in. + +- `validargs_validate_test` ensures that `validargs` sets the validate-mode bit +correctly when the `-v` flag is passed. + +- `validargs_canonicalize_test` ensures that `validargs` sets the canonicalize-mode bit +correctly when the `-c` flag is passed in. + +- `validargs_bits_test` ensures that `validargs` sets the decode-mode bit +correctly when the `-d` flag is passed in and that the value passed with `-b` +is correctly stored in the least-significant byte of `global_options`. + +- `validargs_error_test` ensures that `validargs` returns an error when the `-p` +flag is supplied with the `-v` flag. + +- `help_system_test` uses the `system` syscall to execute your program through +Bash and checks to see that your program returns with `EXIT_SUCCESS`. + +- `argo_basic_test` performs a basic test of the canonicalization mode of the program. 
+ +### Compiling and Running Tests + +When you compile your program with `make`, an `argo_tests` executable will be +created in your `bin` directory alongside the `argo` executable. Running this +executable from the `hw1` directory with the command `bin/argo_tests` will run +the unit tests described above and print the test outputs to `stdout`. To obtain +more information about each test run, you can use the verbose print option: +`bin/argo_tests --verbose=0`. + +The tests we have provided are very minimal and are meant as a starting point +for you to learn about Criterion, not to fully test your homework. You may write +your own additional tests in `tests/basecode_tests.c`, or in additional source +files in the `tests` directory. However, this is not required for this assignment. +Criterion documentation for writing your own tests can be +found [here](http://criterion.readthedocs.io/en/master/). + +Note that grades are assigned based on the number of our own test cases +(not given to you in advance) that your program passes. +So you should work on the assignments in such a way that whatever you do submit +will function. Code that is completely broken will not score any points, +regardless of how voluminous it might be or how long you might have spent on it. + +## Sample Input Files + +In the `rsrc` directory I have placed a few JSON input files for you to try +your code on. + +- `numbers.json`: A JSON file containing a single object with various + numbers as its members. This will exercise most (but probably not all) + of the interesting cases that come up in parsing and outputting numbers. + +- `strings.json`: A JSON file containing a single array with various + strings as its elements. These are intended to exercise most (but again, + probably not all) of the cases involving escape sequences in strings. + +- `package-lock.json`: This is a larger JSON file that I had lying around + which seemed to be a reasonable overall test. 
+ +# Hand-in instructions + +**TEST YOUR PROGRAM VIGOROUSLY BEFORE SUBMISSION!** + +Make sure that you have implemented all the required functions specified in `const.h`. + +Make sure that you have adhered to the restrictions (no array brackets, no prohibited +header files, no modifications to files that say "DO NOT MODIFY" at the beginning, +no functions other than `main()` in `main.c`) set out in this assignment document. + +Make sure your directory tree looks basically like it did when you started +(there could possibly be additional files that you added, but the original organization +should be maintained) and that your homework compiles (you should be sure to try compiling +with both `make clean all` and `make clean debug` because there are certain errors that can +occur one way but not the other). + +This homework's tag is: `hw1` + +`$ git submit hw1` + +> :nerd: When writing your program try to comment as much as possible. Try to +> stay consistent with your formatting. It is much easier for your TA and the +> professor to help you if we can figure out what your code does quickly! + diff --git a/hw2-doc/DebuggingRef.md b/hw2-doc/DebuggingRef.md new file mode 100644 index 0000000..a0b661e --- /dev/null +++ b/hw2-doc/DebuggingRef.md @@ -0,0 +1,1168 @@ +# CSE320 Fall 2018 - Debugging Reference Document + +In this document you will learn about [gdb](https://sourceware.org/gdb/current/onlinedocs/gdb/), the [preprocessor](https://gcc.gnu.org/onlinedocs/cpp/), [assert statements](http://man7.org/linux/man-pages/man3/assert.3.html), [valgrind](http://valgrind.org/docs/manual/manual.html), and a [linter](https://en.wikipedia.org/wiki/Lint_(software)) to debug your assignments in C. + +# Table of Contents + +1. [GDB](#gdb) +2. [The preprocessor](#the-preprocessor) +3. [assert.h](#assert.h) +4. [Valgrind](#valgrind) +5. [Linters](#linters) +6. [Conclusion](#conclusion) + +# GDB + +The tool `gdb` is a command line debugger for C. 
It helps you detect errors which may only occur during the execution time of your program. Let us start out with a simple program so you can learn how to set breakpoints, step through your program, and inspect the values of variables in it. + +```c +// debug.c +#include <stdio.h> +#include <stdlib.h> + +int main(int argc, char *argv[]) { + int i = 0; char *string = "Hello, World!"; + printf("%s\n", string); + return EXIT_SUCCESS; +} +``` + +Compile the program as usual: +
+$ gcc debug.c
+$ ls
+a.out debug.c
+$
+
+ +Now run it with the gdb program. + +
+$ gdb a.out
+...
+Reading symbols from a.out...( no debugging symbols found )...done.
+(gdb)
+
+ +> :nerd: Notice the warning: **no debugging symbols found**. This means that the executable file does not contain the extra information needed to debug the code from the perspective of the C source file. You can only view the assembly language of the target architecture when you see this. + +> :nerd: In the above printout, the ellipses ... indicate that there is other text in its place. There is quite a bit of text that prints out when you first start `gdb`. This mainly describes legal information and information about the current version of the debugger installed. To help shorten the length of this document we have replaced this text with ... in the example printouts. + +If you were an x86_64 assembly expert this might be ok for you but most of us are not. The compiler offers the `-g` flag which will add symbols to the executable file created. We can use symbols from the source code to set our breakpoints. It also allows us to see the source code as we step through the file. Recompile the code with the `-g` flag to add these symbols. + +
+$ gcc -g debug.c
+$ ls
+a.out debug.c
+$
+
+ +Now when you run `gdb` it should locate the debugging symbols. + +
+$ gdb a.out
+...
+Reading symbols from a.out...done.
+(gdb)
+
+ +> :nerd: The string `(gdb)` marks the prompt that is used to interact with the debugger. Any commands you type to instruct `gdb` on what to do should be typed at prompts that look like this. + +## Setting breakpoints + +Now that the file has debugging symbols in it, you can set breakpoints either using the name of a function or the line number. To set a break point in the `gdb` console you should type `break LINE_NUMBER`, where `LINE_NUMBER` is the line number that you want to set a breakpoint on. Alternatively, if you wanted `gdb` to start at the beginning of a function you can type `break FUNCTION_NAME`, where `FUNCTION_NAME` is the name of the function you want to debug. Let us use our sample program `debug.c` and start debugging from the `main` function. + +
+(gdb) break main
+Breakpoint 1 at 0x40053c: file debug.c, line 6.
+(gdb)
+
+ +or + +
+(gdb) break 6
+Breakpoint 1 at 0x40053c: file debug.c, line 6.
+(gdb)
+
+ +> :nerd: Both ways above set the breakpoint at the first line in `main`. + +When dealing with multiple files, you may want to set a breakpoint in another file; you would do so with the gdb command: +
+(gdb) break file.c:8
+
+The format is FILENAME:LINE_NUMBER. + +Once you have set all the breakpoints in your program, you use the command `run` to start the program. The program will execute until it reaches the first breakpoint set in the program. + +
+(gdb) break main
+Breakpoint 1 at 0x40053c: file debug.c, line 6.
+(gdb) run
+Starting program: /home/cse320/a.out
+Breakpoint 1, main (argc=1, argv=0x7fffffffdfd8) at debug.c:6
+6 int i = 0; char *string = "Hello, World!";
+(gdb)
+
+ +In the above output, you should be able to tell that you hit *Breakpoint 1* which is the main function. It has the arguments argc which is equal to the value 1 and argv which starts at the address `0x7fffffffdfd8`. The statement `debug.c:6` tells you that the breakpoint occurred in the file debug.c on line 6. The next line shows you the source code for line 6 which is the line where the breakpoint is set. + +## Printing contents of variables + +GDB provides you with some helpful commands to inspect variables in your program. The easiest and probably most useful command is the `print` command. The print command will show the contents of a variable at the current moment in your program. It also accepts various formatting options to format the output in different ways. Let us print out the value of `i` and `string` at the current moment. + +
+6 int i = 0; char *string = "Hello, World!";
+(gdb) print i
+$1 = 32767
+(gdb) print string
+$2 = 0x0
+
+ +Notice that even though the source code shows that `i = 0` and `*string = "Hello, World!"`, the debugger is showing us that it currently is not. This happens because the current line has not actually executed yet so these assignments have not been made. The variables are set to these junk values because even though the code has not yet assigned values to these variables, the stack frame has already been created, and in the case of our x86 machines local variables are stored onto the stack. + +> :nerd: If you're interested in learning more about the stack frame used in x86_64 *nix environments, there are two great articles you can read. + +> 1. [Journey to the stack](http://duartes.org/gustavo/blog/post/journey-to-the-stack/) +> 2. [Stack frame layout on x86-64](http://eli.thegreenplace.net/2011/09/06/stack-frame-layout-on-x86-64/) + +## Navigating to the next line + +Now that you have inspected all that is interesting at this line, it's time to move to the next line in the program. To do that you can either type `next` or `n` into the `gdb` prompt. + +
+(gdb) next
+7 printf("%s\n", string);
+(gdb)
+
+ +If you print out the values of `i` and `string` again they should be as described in the source code. + +
+7 printf("%s\n", string);
+(gdb) print i
+$3 = 0
+(gdb) print string
+$4 = 0x4005e4 "Hello, World!"
+(gdb)
+
+ +## Changing values at runtime + +There is another command called `set`, which allows you to change the value of a variable while the program is running. This can be useful if you want to force a certain condition to occur. Let's change the value of `string` while the program is running so when *line 7* executes it will no longer print out `Hello, World!`. + +
+(gdb) set var string = "Debugging is fun"
+$5 = 0x602010 "Debugging is fun"
+(gdb)
+
+ +Now, when *line 7* executes this program will print `Debugging is fun` instead of `Hello, World!` + +## Continuing the program + +At some point, you might get tired of typing `next` or `n` to reach your next breakpoint, or maybe you just want the program to run until the end. To do that, you use the command `continue` or `c` to make the program run to next breakpoint or completion. + +
+(gdb) continue
+Continuing.
+Debugging is fun
+[Inferior 1 (process 4943) exited normally]
+(gdb)
+
+ +> :nerd: Notice that the program printed out `Debugging is fun` instead of `Hello, World!`. + +## Restarting the program + +If you wish to restart the program from the start again, with the same breakpoints you just issue the run command again to do so. + +
+Debugging is fun
+[Inferior 1 (process 4943) exited normally]
+(gdb) run
+Starting program: /home/cse320/a.out
+
+Breakpoint 1, main (argc=1, argv=0x7fffffffdfd8) at debug.c:6
+6 int i = 0; char *string = "Hello, World!";
+(gdb)
+
+ +## Inspecting types + +There might be times when you forget the type of a variable while debugging. Luckily for us `gdb` has the command `ptype` which will print out the type of a variable. + +
+(gdb) ptype i
+type = int
+(gdb) ptype string
+type = char *
+(gdb) ptype &i
+type = int *
+(gdb)
+
+ +## Inspecting arrays + +You can use the `print` command to inspect the values of arrays as well. Let's test this out by viewing the contents of the `argv` array passed to the `main` function. + +
+(gdb) print argv
+$7 = (char **) 0x7fffffffdfd8
+(gdb) print argv[0]
+$8 = 0x7fffffffe318 "/home/cse320/a.out"
+(gdb) ptype argv[0]
+type = char *
+(gdb) print argv[1]
+$9 = 0x0
+
+ +> :nerd: Every program has at least 1 command line argument passed to it. This argument is usually the path to the executable although it doesn’t have to be. IE: [execve](http://linux.die.net/man/2/execve) can change it. + +What happens if you try to print past `argv[1]`? On my system, it started printing out what seemed to be [environment variables](https://en.wikipedia.org/wiki/Environment_variable). + +
+(gdb) print argv[2]
+$10 = 0x7fffffffe33a "XDG_VTNR=7"
+(gdb) print argv[3]
+$11 = 0x7fffffffe345 "ORBIT_SOCKETDIR=/tmp/orbit-cse320"
+(gdb) print argv[11]
+$12 = 0x7fffffffe434 "TERM=xterm"
+
+ +This is in the realm of undefined behavior, but this happens because `main` actually has multiple prototypes (shown below). In this `gdb` session, the memory for these storage containers must be located right after `*argv[]` in memory. So when we started printing past the end of `argv` we actually started printing the values of the next contents in memory. + +```c +// some main prototypes +void main(void); +void main(); +int main(void); +int main(); +int main(int argc, char *argv[]); +int main(int argc, char **argv); +int main(int argc, char **argv, char **envp); +int main(int argc, char **argv, char **envp, char **apple); +``` + +The values that printed out most likely coincided with the contents of `char **envp`. You should try this out on different operating systems and see if anything different happens. + +## Exiting gdb + +To end your current session in `gdb`, type the command `quit` to exit. + +
+(gdb) quit
+$
+
+ +## Debugging slightly larger programs + +The program `debug.c` allowed us to showcase some of the basic things you can do with `gdb`, but it doesn't really let us show off all the capabilities of `gdb`. To help do that, we introduce the program `debug2.c` which adds slightly more complexity. + +```c +// debug2.c +1 #include <stdio.h> +2 #include <stdlib.h> +3 +4 int factorial(int num); +5 +6 int main(int argc, char *argv[]) { +7 int value, result; +8 printf("Enter a positive number to compute the factorial of: "); +9 scanf("%d", &value); +10 // You should error check this... +11 result = factorial(value); +12 printf("The result of %d! is %d\n", value, result); +13 return EXIT_SUCCESS; +14 } +15 +16 int factorial(int num) { +17 if(num <= 1) { +18 return 1; +19 } else { +20 return num * factorial(num - 1); +21 } +22 } +``` + +Let us set a *breakpoint* on line 11 and start the program. When the program prompts you, enter the value **5** and press enter. + +
+$ gcc -Wall -Werror -g debug2.c
+$ gdb a.out
+...
+Reading symbols from a.out...done.
+(gdb) break 11
+Breakpoint 1 at 0x400659: file debug2.c, line 11.
+(gdb) run
+Starting program: /home/cse320/a.out
+Enter a positive number to compute the factorial of: 5
+
+Breakpoint 1, main (argc=1, argv=0x7fffffffdd08) at debug2.c:11
+11		result = factorial(value);
+
+ +We are currently stopped at line 11 before calling the `factorial` function. If we want to advance **into** the factorial function how do we do that? If we try to use the command `next` we will get undesirable results. + +
+(gdb) next
+12		printf("The result of %d! is %d\n", value, result);
+(gdb)
+
+ +The program already executed `factorial` and you are now at the statement inside `main` on line 12. Instead you should use the command `step` or `s` so that you can step inside of the function `factorial`. + +
+Breakpoint 1, main (argc=1, argv=0x7fffffffdd08) at debug2.c:11
+11		result = factorial(value);
+(gdb) step
+factorial (num=5) at debug2.c:17
+17		if(num <= 1) {
+(gdb)
+
+ +## Inspecting arguments and locals + +While it's possible to use the command `print` to inspect the arguments and local variables of a function, `gdb` also has the command `info` to expedite this process. The `info` command takes its own set of options. One such option is `args` which shows the name and value of all arguments passed into the function. + +
+(gdb) info args
+num = 5
+(gdb)
+
+ +> :nerd: Yes this function only has one argument so the `info` command in this case is no faster or better than just typing `print num`. But if you had a function which contained many arguments this command may be helpful. + +If your function had local variables you could instead provide the option `locals` to the `info` command. In this case though there are no local variables declared in this function so it tells us there are none. + +
+(gdb) info locals
+No locals.
+(gdb)
+
+ +## Viewing stack frames + +The debugger also gives us the ability to view how many stack frames down we currently are. To do that you type the command `backtrace`. + +
+(gdb) backtrace
+#0  factorial (num=5) at debug2.c:17
+#1  0x0000000000400663 in main (argc=1, argv=0x7fffffffdd08) at debug2.c:11
+
+ +Currently, you can see that, at this point in the program's execution, there are two stack frames created. Another awesome feature involving stack frames and `gdb` is that you can switch to a different stack frame and inspect the variables in that frame by using the `frame` command. + +
+(gdb) frame 1
+#1  0x0000000000400663 in main (argc=1, argv=0x7fffffffdd08) at debug2.c:11
+11		result = factorial(value);
+(gdb) info locals
+value = 5
+result = 32767
+
+ +> :nerd: Here we were able to use the `info locals` command to see the current value of all the local variables in the frame 1 (which is the `main` function). Do you know why the variable `result` is currently set to a seemingly arbitrary number? + +Now let's switch back to `frame 0` and set a breakpoint at line 17 and use the `continue` command. + +
+(gdb) backtrace
+#0  factorial (num=5) at debug2.c:17
+#1  0x0000000000400663 in main (argc=1, argv=0x7fffffffdd08) at debug2.c:11
+(gdb) frame 0
+#0  factorial (num=5) at debug2.c:17
+17		if(num <= 1) {
+(gdb) break 17
+Breakpoint 2 at 0x4006a3: file debug2.c, line 17.
+(gdb) continue
+Continuing.
+
+Breakpoint 2, factorial (num=4) at debug2.c:17
+17		if(num <= 1) {
+(gdb)
+
+ +If you use the `backtrace` command again, you should now see that your program has created 3 stack frames. + +
+(gdb) backtrace
+#0  factorial (num=4) at debug2.c:17
+#1  0x00000000004006bd in factorial (num=5) at debug2.c:20
+#2  0x0000000000400663 in main (argc=1, argv=0x7fffffffdd08) at debug2.c:11
+(gdb)
+
+ +If you're really dying to see more information, there's an option you can provide to `backtrace` called `full` which prints out all the argument and local variable values for each frame that currently exists. + +
+(gdb) backtrace full
+#0  factorial (num=4) at debug2.c:17
+No locals.
+#1  0x00000000004006bd in factorial (num=5) at debug2.c:20
+No locals.
+#2  0x0000000000400663 in main (argc=1, argv=0x7fffffffdd08) at debug2.c:11
+        value = 5
+        result = 32767
+(gdb)
+
+ +To get the most information about the current frame you are in, you can also use the command `info frame` to see lots of information about the current frame. + +
+(gdb) info frame
+Stack level 0, frame at 0x7fffffffdbe0:
+ rip = 0x4006a3 in factorial (debug2.c:17); saved rip = 0x4006bd
+ called by frame at 0x7fffffffdc00
+ source language c.
+ Arglist at 0x7fffffffdbd0, args: num=4
+ Locals at 0x7fffffffdbd0, Previous frame's sp is 0x7fffffffdbe0
+ Saved registers:
+  rbp at 0x7fffffffdbd0, rip at 0x7fffffffdbd8
+
+ +> :nerd: You should notice that the register names here may not be familiar to you. This is because this command `info frame` was executed on a machine running the [x86-64](https://en.wikipedia.org/wiki/X86-64) architecture which has a different set of registers with different names than MIPS. If you were to execute the same command on Sparky it would also look different because Sparky runs on a [SPARC](https://en.wikipedia.org/?title=SPARC) architecture. + +## Removing breakpoints + +Now that we have played around with `backtrace` and changing frames, let's remove the breakpoint and get back to main so we can see the result of `factorial`. To remove breakpoints, `gdb` has the `clear` command. Typing `clear` by itself removes **all** breakpoints, but we only want to remove the one currently in `factorial`. If we use the command `clear` and then the name of the function it will clear the breakpoint inside of that function. + +
+(gdb) break 12
+Breakpoint 3 at 0x400666: file debug2.c, line 12.
+(gdb) clear factorial
+Deleted breakpoint 2
+(gdb) continue
+Continuing.
+
+Breakpoint 3, main (argc=1, argv=0x7fffffffdd08) at debug2.c:12
+12		printf("The result of %d! is %d\n", value, result);
+(gdb)
+
+ +## Finishing up debugging + +The result of `5!` should be `120`. We can first check this by using the `print` command to inspect the value of `result`. + +
+(gdb) print result
+$1 = 120
+(gdb)
+
+ +Using the `continue` command one last time the program should run to completion and we can then quit by typing `quit`. + +
+(gdb) continue
+Continuing.
+The result of 5! is 120
+[Inferior 1 (process 28948) exited normally]
+(gdb) quit
+$
+
+ +## More gdb resources + +The `gdb` debugger is a very powerful tool and many books and tutorials have been created to help show all of its power. If you are interested in learning even more things that can be done with `gdb`, we refer you to the following online resources. + +1. [official docs](https://sourceware.org/gdb/current/onlinedocs/gdb/) +2. [useful commands](http://stackoverflow.com/questions/1471226/most-tricky-useful-commands-for-gdb-debugger) +3. [learning C with gdb](https://www.recurse.com/blog/5-learning-c-with-gdb) +4. [gdb interactive shell](http://blog.pclewis.com/2010/03/tip-using-gdb-as-an-interactive-c-shell/) +5. [debugging with gdb](http://betterexplained.com/articles/debugging-with-gdb/) +6. [quick guide to gdb](http://beej.us/guide/bggdb/) +7. [gdb debugger tutorial](http://www.unknownroad.com/rtfm/gdbtut/gdbtoc.html) + +## Slightly better gdb tool + +Occasionally, it's nice to be able to see the source code while debugging your programs in C. You can either provide a flag to make `gdb` display the source code or use the program `cgdb` which is slightly nicer. + +### cgdb + +The program `cgdb` will split your terminal into two windows. One showing your source code and the other showing the `gdb` input window. + +![cgdb](https://cgdb.github.io/images/screenshot_debugging.png) + +It is possible to switch between the panes by pressing `esc` to switch to the code pane. In the code pane you can press the `up` and `down` arrow keys to look at the code. When you get to a line where you would want to set a breakpoint you can press the `spacebar` to toggle a break point on and off. + +When you're done looking at the code you can press `i` to go back to the `gdb` input window. All the commands in `cgdb` are exactly the same as gdb. + +More commands can be found [here](https://cgdb.github.io/docs/cgdb.html#Controlling-CGDB). + + +### gdb -tui + +The tool `gdb` itself also has a graphical mode which you can start by using the flag `-tui`. + +
+$ gdb -tui a.out
+
+![gdb-tui](http://i.imgur.com/GFyJrdp.png) + +Reference [here](https://sourceware.org/gdb/onlinedocs/gdb/TUI.html) for commands to control the panes. + +### Visual debugger limitations + +Unfortunately, both these tools when in the "gui" mode suffer from the same issue. If you want to debug a program that accepts input from the command line, they do weird things and often get stuck and freeze. To see your source code (or at least part of it) you can type `l` and it will display some of the source code currently around where your program is currently stopped at. + +
+Breakpoint 1, main (argc=1, argv=0x7fffffffdd08) at debug2.c:6
+6	int main(int argc, char *argv[]) {
+(gdb) l
+1	#include <stdio.h>
+2	#include <stdlib.h>
+3
+4	int factorial(int num);
+5
+6	int main(int argc, char *argv[]) {
+7		int value, result;
+8		printf("Enter a positive number to compute the factorial of: ");
+9		scanf("%d", &value);
+10		// You should error check this...
+(gdb)
+
+ +It's also possible for you to redirect input from a file into your `gdb` session. To do this you first need to create a text file that has the commands you want to type. The program `debug2.c` expects the user to type a positive number at line 9. If we wanted to use `cgdb` or `gdb -tui`, we could make the text file `input.txt` that has the following contents: + +
+5
+
+ +Now start a new `gdb` debugging session and when you type `run` you can redirect the file using the `<` operator. + +
+$ gcc -Wall -Werror -g debug2.c
+$ gdb -tui a.out
+...
+Reading symbols from a.out...done.
+(gdb) break 11
+Breakpoint 1 at 0x400659: file debug2.c, line 11.
+(gdb) run < input.txt
+...
+
+ +### rr: lightweight recording & deterministic debugging + +There's a tool currently being developed by Mozilla called [rr](http://rr-project.org/). This tool is described as an enhancement to `gdb`. It allows us to record the execution of a program, and then you can replay this execution over and over. Mozilla provides a nice [slideshow](https://mozilla.github.io/rr/rr.html) explaining the technology. A simple getting started tutorial can be found [here](https://github.com/mozilla/rr/wiki/Usage) as well. + +This concludes our very short `gdb` introduction. + +# The preprocessor + +So far, you have learned a bit about using `gcc` and `gdb` to help debug and fix your programs. While these are very useful tools, sometimes the easiest way to see what is happening in your program is to just use print statements. We shouldn't just put `printf` all over our program though. We might not always want to see these print outs (way too much information for normal operation). One possible solution to this is passing a command line argument that turns debugging on and off. This might be an acceptable solution but it will clutter our code with lots of if statements to check if debugging is enabled or not, make our binary larger when we don't want debugging enabled, etc. Instead, we will use some preprocessor tricks to give us some logging statements when we **compile with** the flag `-DDEBUG`. When we **compile without** the flag `-DDEBUG`, these debugging statements will **not be added** to the executable and therefore will not print. + +## Logging with the preprocessor + +Somewhere at the top of your source file put the following line of code defining our debug logging macro + +```c +#define debug(msg) printf("DEBUG: %s", msg) +``` + +Then in your program use the `debug` macro + +```c +// debug.c +#include <stdio.h> +#include <stdlib.h> + +#define debug(msg) printf("DEBUG: %s", msg) + +int main(int argc, char *argv[]) { + debug("Hello, World!\n"); + return EXIT_SUCCESS; +} +``` + +Then compile your program and run it. + +
+ +## Logging with the preprocessor + +Somewhere at the top of your source file put the following line of code defining our debug logging macro + +```c +#define debug(msg) printf("DEBUG: %s", msg) +``` + +Then in your program use the `debug` macro + +```c +// debug.c +#include +#include + +#define debug(msg) printf("DEBUG: %s", msg) + +int main(int argc, char *argv[]) { + debug("Hello, World!\n"); + return EXIT_SUCCESS; +} +``` + +Then compile your program and run it. + +
+$ gcc debug.c
+$ ./a.out
+DEBUG: Hello, World!
+
+ +Great! You just created your first [preprocessor macro](https://gcc.gnu.org/onlinedocs/cpp/Macros.html). Unfortunately, this is no better than just adding a print statement. Let's fix that! + +The preprocessor has *if*, *elif*, and *else* [directives](https://gcc.gnu.org/onlinedocs/gcc-3.0.2/cpp_4.html) that we can use to control what gets added during compilation. Let's create an *if* directive that will include a section of code if `DEBUG` is defined within the preprocessor. + +```c +//debug.c +#include <stdio.h> +#include <stdlib.h> + +#define debug(msg) printf("DEBUG: %s", msg) + +int main(int argc, char *argv[]) { + #ifdef DEBUG + debug("Debug flag was defined\n"); + #endif + printf("Hello, World!\n"); + return EXIT_SUCCESS; +} +``` + +When we compile this program, it will check to see if `#define DEBUG` was defined in our program. Let's test this out. + +
+$ gcc debug.c
+$ ./a.out
+Hello, World!
+
+ +Cool! The debug message didn't print out. Now let's define `DEBUG` during the compilation process, and run the program again. + +> :nerd: The flag `-D` stands for define. + +
+$ gcc -DDEBUG debug.c
+$ ./a.out
+DEBUG: Debug flag was defined
+Hello, World!
+
+ +Here you can see that debug was defined so that extra code between `#ifdef DEBUG` and `#endif` was included. This technique will work for certain situations, but if we have a lot of logging messages in our program this will quickly clutter our code and make it unreadable. Fortunately we can do better. + +Instead of doing `#ifdef DEBUG` all over our program, we can instead do `#ifdef DEBUG` around our `#define debug` macro. + +```c +// debug.c +#include +#include + +#ifdef DEBUG + #define debug(msg) printf("DEBUG: %s", msg) +#endif + +int main(int argc, char *argv[]) { + debug("Debug flag was defined\n"); + printf("Hello, World!\n"); + return EXIT_SUCCESS; +} +``` + +There is an issue with this, but let's try to compile the program. + +
+$ gcc -DDEBUG debug.c
+$ ./a.out
+DEBUG: Debug flag was defined
+Hello, World!
+
+ +Cool it works. Now let's try to compile it without defining `-DDEBUG`. + +
+$ gcc debug.c
+/tmp/cc6F04VW.o: In function `main':
+debug.c:(.text+0x1a): undefined reference to `debug'
+collect2: error: ld returned 1 exit status
+
+ +Whoops. What happened here? Well when we used `-DDEBUG` the `debug` macro was defined, so it worked as expected. When we don't compile with `-DDEBUG` the `#define debug` is never declared in our file so it is never substituted in our program. Since we used `debug` in the middle of our code, the preprocessor and compiler have no idea what `debug` symbol is, so it fails. Luckily this is easy to fix. We simply have to add another case to our preprocessor *if*, *else* statement to handle this case. + +```c +// debug.c +#include +#include + +#ifdef DEBUG + #define debug(msg) printf("DEBUG: %s", msg) +#else + #define debug(msg) +#endif + +int main(int argc, char *argv[]) { + debug("Debug flag was defined\n"); + printf("Hello, World!\n"); + return EXIT_SUCCESS; +} +``` + +Here we tell the preprocessor to replace any occurrences of `debug(msg)` with nothing. So now, when we don't compile with `-DDEBUG`, the preprocessor simply replaces `debug("Debug flag was defined\n")` with an empty space. Let's compile again. + +
+$ gcc debug.c
+$ ./a.out
+Hello, World!
+
+ +Cool. Now we can embed `debug` macros all over our program that look like normal functions. There are still a few more cool tricks we can do to make this better. + +The preprocessor has a few special macros defined called `__LINE__`, `__FILE__`, and `__FUNCTION__`. These macros will be replaced by the preprocessor to evaluate to the *line number* where the macro is called, the *filename* that the macro is called in, and the *function name* that the macro is called in. Let's play with this a bit. + +```c +// debug.c +#include +#include + +#ifdef DEBUG + #define debug(msg) printf("DEBUG: %s:%s:%d %s", __FILE__, __FUNCTION__, __LINE__, msg) +#else + #define debug(msg) +#endif + +int main(int argc, char *argv[]) { + debug("Debug flag was defined\n"); + printf("Hello, World!\n"); + return EXIT_SUCCESS; +} +``` + +Let's compile this program and run. + +
+$ gcc -DDEBUG debug.c
+$ ./a.out
+DEBUG: debug.c:main:11 Debug flag was defined
+Hello, World!
+
+ +As you can see all the `__FILE__`, `__FUNCTION__`, and `__LINE__` were replaced with the corresponding values for when debug was called in the program. Pretty cool, but we can still do even better! Normally when we want to print something, we use `printf()`, the format specifiers and variable arguments to print useful information. With our current setup though we can't do that. Fortunately for us the preprocessor offers up a `__VA_ARGS__` macro which we can use to accomplish this. + +> :nerd: I want to point out that the syntax for this gets a bit crazy and hard to understand (complex preprocessor stuff is a bit of a black art). I'll try my best to describe it but you may need to do some more googling if the below explanation is not sufficient. + +```c +// debug.c +#include +#include + +#ifdef DEBUG + #define debug(fmt, ...) printf("DEBUG: %s:%s:%d " fmt, __FILE__, __FUNCTION__, __LINE__, ##__VA_ARGS__) +#else + #define debug(fmt, ...) +#endif + +int main(int argc, char *argv[]) { + debug("Program has %d args\n", argc); + printf("Hello, World!\n"); + return EXIT_SUCCESS; +} +``` + +First, let's compile and run the program and see the results. +
+$ gcc -DDEBUG debug.c
+$ ./a.out
+DEBUG: debug.c:main:11 Program has 1 args
+Hello, World!
+$ gcc debug.c
+$ ./a.out
+Hello, World!
+
+The macro works as expected, but let's try to explain it a bit. + +First, we changed the definition of the macro to be `#define debug(fmt, ...)`. The first argument `fmt` is the format string that we normally define for `printf` and `...` is the way to declare a macro that accepts a variable number of arguments. + +Next we have `"DEBUG: %s:%s:%d " fmt`. This was very confusing to me at first, but the C compiler can [concatenate string literals](http://msdn.microsoft.com/en-us/library/c7bt45zf.aspx) that are next to each other. So if `fmt` was the string `"crazy %d concatenation"` then this statement evaluates to `"DEBUG: %s:%s:%d crazy %d concatenation"`. Then, we have our predefined preprocessor macros that are used for the string `"DEBUG: %s:%s:%d "`, and then we reach this next confusing statement: `, ##__VA_ARGS__`. The macro `__VA_ARGS__` will expand into the variable arguments provided to the debug statement, but then we have this crazy `, ##`. This is a hack for allowing no arguments to be passed to the debug macro, Ex. `debug("I have no varargs")`. If we didn't do this, the previous debug statement would throw a warning/error during the compilation process as it would expect a `__VA_ARGS__` value. + +This is one of the many interesting things we can use the C preprocessor for. I'll leave you with a final example of another interesting technique that you might see in real code. Have fun and experiment! + +```c +// debug.c +#include +#include + +#ifdef DEBUG + #define debug(fmt, ...) do{printf("DEBUG: %s:%s:%d " fmt, __FILE__, __FUNCTION__, __LINE__, ##__VA_ARGS__);}while(0) + #define info(fmt, ...) do{printf("INFO: %s:%s:%d " fmt, __FILE__, __FUNCTION__, __LINE__, ##__VA_ARGS__);}while(0) +#else + #define debug(fmt, ...) + #define info(fmt, ...) do{printf("INFO: " fmt, ##__VA_ARGS__);}while(0) +#endif + +int main(int argc, char *argv[]) { + debug("Program has %d args\n", argc); + printf("Hello, World!\n"); + info("Info statement. 
Should print out always, with varying amounts of information depending on flags provided.\n"); + return EXIT_SUCCESS; +} +``` +
+$ gcc debug.c
+$ ./a.out
+Hello, World!
+INFO: Info statement. Should print out always, with varying amounts of information depending on flags provided.
+$ gcc -DDEBUG debug.c
+$ ./a.out
+DEBUG: debug.c:main:13 Program has 1 args
+Hello, World!
+INFO: debug.c:main:15 Info statement. Should print out always, with varying amounts of information depending on flags provided.
+
+ +> :nerd: Some programmers like to wrap the code in macros with a `do{ /* some code here */ }while(false)` loop. They do this because if your macro is made up of multiple statements, it will force you to add `;` to all the statements in the do while loop. Then, you still have to terminate this macro with a `;` when you use it, which makes it seem like a normal function in your C code. + +> **tldr;** It is a way to prevent yourself from making stupid mistakes with macros. + +# assert.h + +It is typical for functions of a library to go through some rigorous testing stages to prove that it produces correct results. While there are many more complete third party solutions, the C standard library provides us with `assert.h`, which gives us access to a single macro known as [assert](http://man7.org/linux/man-pages/man3/assert.3.html). While very basic, we can use the `assert` macro to test the inputs and result of our functions. This is yet another form of debugging to assist us in writing correct programs in C. + +Consider a very simple C program `assert.c`. + +```c +// assert.c +#include +#include + +int bad_len(const char *str); + +int main(int argc, char *argv[]) { + char *str = NULL; + int length = bad_len(str); + printf("strlen: %d\n", length); + return EXIT_SUCCESS; +} + +int bad_len(const char *str) { + int len = 0; + if(str != NULL) { + /* Some fancy code for computing the length */ + len = 4; + } + return len; +} +``` + +
+$ gcc -Wall -Werror assert.c
+$ ./a.out
+strlen: 0
+
+ +The program didn't fail to compile or run but what exactly went wrong? Why is the length zero? We could just step through the debugger, or make some print statements, but instead let's use assert. Let's modify the following snippet of code by adding in an `assert` statement. + +```c +// assert.c +#include +#include +#include + +int bad_len(const char *str); + +int main(int argc, char *argv[]) { + char *str = NULL; + int length = bad_len(str); + printf("strlen: %d\n", length); + return EXIT_SUCCESS; +} + +int bad_len(const char *str) { + int len = 0; + assert(str != NULL); /* ADD THIS LINE */ + if(str != NULL) { + /* Some fancy code for computing the length */ + len = 4; + } + return len; +} +``` + +Compile this program and run it again. + +
+$ gcc -Wall -Werror assert.c
+$ ./a.out
+a.out: assert.c:16: bad_len: Assertion `str != ((void *)0)' failed.
+Aborted (core dumped)
+$
+
+ +We can see that `str` was equal to `NULL` (for some reason I thought it was not) which is why our length returned was 0. Let's fix our program so it passes the assert statement. + + +```c +// assert.c +#include +#include +#include + +int bad_len(const char *str); + +int main(int argc, char *argv[]) { + char *str = "Debugging in C is so awesome."; + int length = bad_len(str); + printf("strlen: %d\n", length); + return EXIT_SUCCESS; +} + +int bad_len(const char *str) { + int len = 0; + assert(str != NULL); + if(str != NULL) { + /* Some fancy code for computing the length */ + len = 4; + } + return len; +} +``` + +Compile this program and run it again. + +
+$ gcc -Wall -Werror assert.c
+$ ./a.out
+strlen: 4
+$
+
+ +Hmm. We got rid of the first error, but this is not the right return value. In a more complicated program we might not notice that the return value is incorrect if we didn't print it out. Let's add an `assert` statement to check the return value of `bad_len`. + +```c +// assert.c +#include +#include +#include + +int bad_len(const char *str); + +int main(int argc, char *argv[]) { + char *str = "Debugging in C is so awesome."; + int length = bad_len(str); + assert(length == 29); + printf("strlen: %d\n", length); + return EXIT_SUCCESS; +} + +int bad_len(const char *str) { + int len = 0; + assert(str != NULL); + if(str != NULL) { + /* Some fancy code for computing the length */ + len = 4; + } + return len; +} +``` + +Compile this program and run it again. + +
+$ gcc -Wall -Werror assert.c
+$ ./a.out
+a.out: assert.c:10: main: Assertion `length == 29' failed.
+Aborted (core dumped)
+
+ +Here we can now see very quickly that the length returned by our function was not the correct value. After fixing `bad_len`, compiling and running again we can get the correct value. We know now that for this input our function is working correctly. + +### Best practices with assert + +While `assert` is helpful, this is not a good solution for actually error checking inputs in our programs. The `assert` functionality can be turned off during the compilation process using the flag `-DNDEBUG` (similar to how you could turn your debug prints using the preprocessor). While grading your assignments, we may compile with `-DNDEBUG`. Because of this, it is not a good idea to use `assert` as the only way to validate the inputs to your functions. You should use assert under the following conditions: + +* Use assert to check for conditions that should be impossible to happen in your program. +* Use assert during the debugging process to check inputs, but also check them correctly as well. + * i.e., In the example we use assert to fail quickly if the input is NULL, but we still explicitly check to see if the input is `NULL` in our normal logic. +* Do not use assert when the operation used inside of it has side effects. See the two examples below: + +**Bad Side Effects Example:** +```c +assert(++x > MIN_VALUE); +if(x > MIN_VALUE) { + /* Do something */ +} +``` + +**Good Side Effects Example:** +```c +++x; +assert(x > MIN_VALUE); +if(x > MIN_VALUE) { + /* Do something */ +} +``` + +> :nerd: Using assert is not required, but it can assist you in tracking down nasty bugs in your program. Usually bugs that happen after changing a seemingly working function or when your function hits an edge case you might have never thought of. + +# Valgrind + +[Valgrind](http://valgrind.org/docs/manual/manual.html) is another tool which you can use for detecting errors at runtime. 
What differentiates Valgrind from `gdb` is that it can detect things such as memory leaks, out of bounds memory access, use of uninitialized variables, open file handles, etc. These types of errors are not directly reported in `gdb` and errors such as "out of bounds memory access" may just create what is known as a `segmentation fault` and crash your program without you ever knowing what happened. It's also not an interactive process like `gdb` (although it is possible to use `gdb` and Valgrind together at the same time). + +Valgrind takes advantage of the debugging symbols added to your program when compiled with the `-g` flag. You should continue to compile your program with the `-g` flag when you use Valgrind to test your program. + +## Detecting memory leaks + +Valgrind is typically used for detecting memory leaks in a program. Let's create a naive program `valgrind.c` with an obvious memory leak and run it through Valgrind and see what happens. + +```c +// valgrind.c +#include +#include + +int main(int argc, char *argv[]) { + int *var = malloc(sizeof(int)); + // Assign a value + *var = 4; + // Print the value + printf("The value of var is: %d\n", *var); + return EXIT_SUCCESS; +} +``` + +
+$ gcc -Wall -Werror -g valgrind.c
+$
+
+ +The program compiles with no errors but clearly there is a memory leak in this program (We ask for memory using malloc but never free it). To run this program through Valgrind type the following command: + +
+$ valgrind ./a.out
+==30532== Memcheck, a memory error detector
+==30532== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
+==30532== Using Valgrind-3.10.0 and LibVEX; rerun with -h for copyright info
+==30532== Command: ./a.out
+==30532==
+The value of var is: 4
+==30532==
+==30532== HEAP SUMMARY:
+==30532==     in use at exit: 4 bytes in 1 blocks
+==30532==   total heap usage: 1 allocs, 0 frees, 4 bytes allocated
+==30532==
+==30532== LEAK SUMMARY:
+==30532==    definitely lost: 4 bytes in 1 blocks
+==30532==    indirectly lost: 0 bytes in 0 blocks
+==30532==      possibly lost: 0 bytes in 0 blocks
+==30532==    still reachable: 0 bytes in 0 blocks
+==30532==         suppressed: 0 bytes in 0 blocks
+==30532== Rerun with --leak-check=full to see details of leaked memory
+==30532==
+==30532== For counts of detected and suppressed errors, rerun with: -v
+==30532== ERROR SUMMARY: 0 errors from 0 contexts (suppressed: 0 from 0)
+$
+
+ +Here, you can see in the green text that this program had 1 memory allocation, but no frees. Then, in `LEAK SUMMARY` you can see in the red text that the program definitely lost 4 bytes. So these lines tell us there was a memory leak but from what? + +If you look at the Valgrind printout, it provides us with instructions, which we highlighted in purple, to use the flag (``--leak-check=full``) to get more details about the errors detected by Valgrind. + +
+$ valgrind --leak-check=full ./a.out
+==30535== Memcheck, a memory error detector
+==30535== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
+==30535== Using Valgrind-3.10.0 and LibVEX; rerun with -h for copyright info
+==30535== Command: ./a.out
+==30535==
+The value of var is: 4
+==30535==
+==30535== HEAP SUMMARY:
+==30535==     in use at exit: 4 bytes in 1 blocks
+==30535==   total heap usage: 1 allocs, 0 frees, 4 bytes allocated
+==30535== 
+==30535== 4 bytes in 1 blocks are definitely lost in loss record 1 of 1
+==30535==    at 0x4C2ABA0: malloc (in /usr/lib/valgrind/vgpreload_memcheck-amd64-linux.so)
+==30535==    by 0x40059E: main (valgrind.c:5)
+==30535==
+==30535== LEAK SUMMARY:
+==30535==    definitely lost: 4 bytes in 1 blocks
+==30535==    indirectly lost: 0 bytes in 0 blocks
+==30535==      possibly lost: 0 bytes in 0 blocks
+==30535==    still reachable: 0 bytes in 0 blocks
+==30535==         suppressed: 0 bytes in 0 blocks
+==30535==
+==30535== For counts of detected and suppressed errors, rerun with: -v
+==30535== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0)
+$
+
+ +The flag `--leak-check=full` gives us the extra content in red which tells us that this memory leak originates from *line 5* which is where we originally used `malloc` to allocate the integer. + +## Out of bounds access + +Let's fix the memory leak by adding a free call and instead access a value out of bounds. + +```c +// valgrind.c +#include +#include + +int main(int argc, char *argv[]) { + int *var = malloc(sizeof(int)); + // Assign a value + *var = 4; + // Print the value + printf("The value of var is: %d\n", *var); + printf("Out of bounds access: %s\n", argv[-1]); + free(var); + return EXIT_SUCCESS; +} +``` + +
+$ gcc -Wall -Werror -g valgrind.c
+$
+
+ +Still no errors produced. Now let's run the program through Valgrind again. + +
+$ valgrind --leak-check=full ./a.out
+==30552== Memcheck, a memory error detector
+==30552== Copyright (C) 2002-2013, and GNU GPL'd, by Julian Seward et al.
+==30552== Using Valgrind-3.10.0 and LibVEX; rerun with -h for copyright info
+==30552== Command: ./a.out
+==30552==
+The value of var is: 4
+
+==30552== Invalid read of size 1
+==30552==    at 0x4E82AF3: vfprintf (vfprintf.c:1642)
+==30552==    by 0x4E8B588: printf (printf.c:33)
+==30552==    by 0x400620: main (valgrind.c:10)
+==30552==  Address 0x1 is not stack'd, malloc'd or (recently) free'd 
+==30552==
+==30552==
+==30552== Process terminating with default action of signal 11 (SIGSEGV)
+==30552==  Access not within mapped region at address 0x1
+==30552==    at 0x4E82AF3: vfprintf (vfprintf.c:1642)
+==30552==    by 0x4E8B588: printf (printf.c:33)
+==30552==    by 0x400620: main (valgrind.c:10)
+==30552==  If you believe this happened as a result of a stack
+==30552==  overflow in your program's main thread (unlikely but
+==30552==  possible), you can try to increase the size of the
+==30552==  main thread stack using the --main-stacksize= flag.
+==30552==  The main thread stack size used in this run was 8388608.
+Out of bounds access: ==30552== 
+==30552== HEAP SUMMARY:
+==30552==     in use at exit: 4 bytes in 1 blocks
+==30552==   total heap usage: 1 allocs, 0 frees, 4 bytes allocated
+==30552==
+==30552== LEAK SUMMARY:
+==30552==    definitely lost: 0 bytes in 0 blocks
+==30552==    indirectly lost: 0 bytes in 0 blocks
+==30552==      possibly lost: 0 bytes in 0 blocks
+==30552==    still reachable: 4 bytes in 1 blocks
+==30552==         suppressed: 0 bytes in 0 blocks
+==30552== Reachable blocks (those to which a pointer was found) are not shown.
+==30552== To see them, rerun with: --leak-check=full --show-leak-kinds=all
+==30552==
+==30552== For counts of detected and suppressed errors, rerun with: -v
+==30552== ERROR SUMMARY: 1 errors from 1 contexts (suppressed: 0 from 0)
+Segmentation fault (core dumped)
+$
+
+ +Accessing the array out of bounds in this case created a segmentation fault (this is lucky; sometimes these things don't crash and give you bugs which you can't reproduce) and crashed the program before free could be called (hence the still reachable 4 bytes). The text colored in red shows Valgrind reporting that an out of bounds memory access has occurred. + +This is a short intro to Valgrind but should be all you need for creating correct programs with no memory leaks in C. + +# Linters + +There exists a tool called a [linter](https://en.wikipedia.org/wiki/Lint_(software)) which you may find very helpful. Similar to syntax errors, if you have a linter installed in your editor as you type code, you will see errors. These errors will not be syntax errors. Instead, they alert you about memory leaks, unused variables, etc. The following is a list of some popular linters for a few popular text editors. + +* [Sublime text](http://www.sublimetext.com/) + * [sublimelinter](http://www.sublimelinter.com/en/latest/) - Required to be installed + * Pick one of the following + * [sublimelinter-clang](https://packagecontrol.io/packages/SublimeLinter-contrib-clang) (This one seems to be better) + * [sublimelinter-cpp](https://github.com/SublimeLinter/SublimeLinter-cppcheck) +* [Atom](https://atom.io/) + * [linter-clang](https://atom.io/packages/linter-clang) (This one seems to be better) + * [linter-gcc](https://atom.io/packages/linter-gcc) +* [Vim](http://www.vim.org/) + * [syntastic](http://www.vim.org/scripts/script.php?script_id=2736) + +A linter may exist for your favorite editor, but you will have to find it for yourself. + +# Conclusion + +Now that you have read about the many different techniques to debug your program, you should now apply these things to all homework assignments for the rest of the semester. Now move on to the HW2 Assignment. 
You will want to refer back to this document for reference as many of the errors and issues you will encounter are described in this document. diff --git a/hw2-doc/README.md b/hw2-doc/README.md new file mode 100644 index 0000000..030ec09 --- /dev/null +++ b/hw2-doc/README.md @@ -0,0 +1,420 @@ +# Homework 2 Debugging and Fixing - CSE 320 - Spring 2022 +#### Professor Eugene Stark + +### **Due Date: Friday 3/4/2022 @ 11:59pm** + +# Introduction + +In this assignment you are tasked with updating an old piece of +software, making sure it compiles, and that it works properly +in your VM environment. + +Maintaining old code is a chore and an often hated part of software +engineering. It is definitely one of the aspects which are seldom +discussed or thought about by aspiring computer science students. +However, it is prevalent throughout industry and a worthwhile skill to +learn. Of course, this homework will not give you a remotely +realistic experience in maintaining legacy code or code left behind by +previous engineers but it still provides a small taste of what the +experience may be like. You are to take on the role of an engineer +whose supervisor has asked you to correct all the errors in the +program, plus add additional functionality. + +By completing this homework you should become more familiar +with the C programming language and develop an understanding of: + +- How to use tools such as `gdb` and `valgrind` for debugging C code. +- Modifying existing C code. +- C memory management and pointers. +- Working with files and the C standard I/O library. + +## The Existing Program + +Your goal will be to debug and extend an old program called `par`, +which was written by Adam M. Costello and posted to Usenet in 1993. +I have rearranged the original source code and re-written the `Makefile` +to conform to the format we are using for the assignments in this course. 
+Besides a bug that was present in the original version, I have introduced +a few additional bugs here and there to make things more interesting +and educational for you :wink:. +Although you will need to correct these bugs in order to make the program +function, they do not otherwise change the program behavior from what +the author intended. + +The `par` program is a simple paragraph reformatter. It is basically +designed to read text from the standard input, parse the text into +paragraphs, which are delimited by empty lines, chop each paragraph up +into a sequence of words (forgetting about the original line breaks), +choose new line breaks to optimize some criteria that are designed to +produce a pleasing result, and then finally output the paragraph with +the new line breaks. There are several parameters that can be set +which affect the result: the width of the output text, the length of +a "prefix" and a "suffix" to be prepended and appended to each output line, +a parameter "hang", which affects the default value of "prefix", and +a boolean parameter "last", which affects the way the last line of a +paragraph is treated. + +What you have to do is to first get the program to compile (for the most part, +I did not modify the original code, which requires some changes for it +to compile cleanly with the compiler and settings we are using). +Then, you need to test the program and find and fix the bugs that prevent it +from functioning properly. Some of the bugs existed in the original version and +some I introduced for the purposes of this assignment. +Finally, you will make some modifications to the program. + +As you work on the program, limit the changes you make to the minimum necessary +to achieve the specified objectives. Don't rewrite the program; +assume that it is essentially correct and just fix a few compilation errors and +bugs as described below. You will likely find it helpful to use `git` for this (I did). 
+Make exploratory changes first on a side branch (*i.e.* not the master branch), +then when you think you have understood the proper changes that need to be made, +go back and apply those changes to the master branch. Using `git` will help you +to back up if you make changes that mess something up. + +### Getting Started - Obtain the Base Code + +Fetch base code for `hw2` as you did for the previous assignments. +You can find it at this link: +[https://gitlab02.cs.stonybrook.edu/cse320/hw2](https://gitlab02.cs.stonybrook.edu/cse320/hw2). + +Once again, to avoid a merge conflict with respect to the file `.gitlab-ci.yml`, +use the following command to merge the commits: + +
+  git merge -m "Merging HW2_CODE" HW2_CODE/master --strategy-option=theirs
+
+ + > :nerd: I hope that by now you would have read some `git` documentation to find + > out what the `--strategy-option=theirs` does, but in case you didn't :angry: + > I will say that merging in `git` applies a "strategy" (the default strategy + > is called "recursive", I believe) and `--strategy-option` allows an option + > to be passed to the strategy to modify its behavior. In this case, `theirs` + > means that whenever a conflict is found, the version of the file from + > the branch being merged (in this case `HW2_CODE/master`) is to be used in place + > of the version from the currently checked-out branch. An alternative to + > `theirs` is `ours`, which makes the opposite choice. If you don't specify + > one of these options, `git` will leave conflict indications in the file itself + > and it will be necessary for you to edit the file and choose the code you want + > to use for each of the indicated conflicts. + +Here is the structure of the base code: + +
+.
+├── .gitlab-ci.yml
+└── hw2
+    ├── doc
+    │   ├── par.1
+    │   ├── par.doc
+    │   └── protoMakefile
+    ├── hw2.sublime-project
+    ├── include
+    │   ├── buffer.h
+    │   ├── debug.h
+    │   ├── errmsg.h
+    │   └── reformat.h
+    ├── Makefile
+    ├── rsrc
+    │   ├── banner.txt
+    │   ├── gettysburg.txt
+    │   └── loremipsum.txt
+    ├── src
+    │   ├── buffer.c
+    │   ├── errmsg.c
+    │   ├── main.c
+    │   ├── par.c
+    │   └── reformat.c
+    ├── test_output
+    │   └── .git-keep
+    └── tests
+        ├── basecode_tests.c
+        ├── rsrc
+        │   ├── banner.txt
+        │   ├── basic.in -> gettysburg.txt
+        │   ├── basic.out
+        │   ├── blank_lines.txt
+        │   ├── EOF.in
+        │   ├── EOF.out
+        │   ├── gettysburg.txt
+        │   ├── loremipsum.txt
+        │   ├── prefix_suffix.in -> banner.txt
+        │   ├── prefix_suffix.out
+        │   ├── valgrind_leak.in -> gettysburg.txt
+        │   ├── valgrind_leak.out
+        │   ├── valgrind_uninitialized.err
+        │   ├── valgrind_uninitialized.in -> loremipsum.txt
+        │   └── valgrind_uninitialized.out
+        ├── test_common.c
+        └── test_common.h
+
+ +The `src` directory contains C source code files `buffer.c`, `par.c`, `reformat.c`, +and `errmsg.c`, which were part of the original code. In addition, I have added +a new file `main.c`, with a single `main()` function that simply calls +`original_main()` in `par.c`. This is to satisfy our requirement (for Criterion) +that `main()` is the only function in `main.c`. + +The `include` directory contains C header files `buffer.h`, `reformat.h`, and +`errmsg.h`, which were part of the original source code. I have also added our +`debug.h` header file which may be of use to you. + +The `doc` directory contains documentation files that were part of the original +distribution of `par`. The file `par.1` is in the format traditionally used +for Unix manual pages. It is intended to be processed with the +formatting program `nroff` with argument `-man`; for example: +`nroff -man doc/par.1 | less` could be used to format and view its contents. + +The `tests` directory contains C source code (in file `basecode_tests.c`) for some Criterion +tests that can help guide you toward bugs in the program. These are not guaranteed +to be complete or exhaustive. The `test_common.c` and `test_common.h` contain auxiliary code +used by the tests. The subdirectory `tests/rsrc` contains input files and reference output files +that are used by the tests. +The `par` program was not designed to be particularly conducive to unit testing, +so all the tests we will make (including the tests used in grading) will be so-called +"black box" tests, which test the input-output behavior of the program running as a +separate process from the test driver. +The `test_common.c` file contains helper functions for launching an instance of `par` +as a separate process, redirecting `stdin` from an input file, collecting the +output produced on `stdout` and `stderr`, checking the exit status of the program, +and comparing the output against reference output. 
+ +The `test_output` directory is a "dummy" directory which is used to hold the output +produced when you run the Criterion tests. Look there if you want to understand, +for example, why the tests reported that the output produced by your program was +not as expected. + +Before you begin work on this assignment, you should read the rest of this +document. In addition, we additionally advise you to read the +[Debugging Document](DebuggingRef.md). One of the main goals of this assignment +is to get you to learn how to use the `gdb` debugger, so you should right away +be looking into how to use this while working on the tasks in the following sections. + +# Part 1: Debugging and Fixing + +You are to complete the following steps: + +1. Clean up the code; fixing any compilation issues, so that it compiles + without error using the compiler options that have been set for you in + the `Makefile`. + Use `git` to keep track of the changes you make and the reasons for them, so that you can + later review what you have done and also so that you can revert any changes you made that + don't turn out to be a good idea in the end. + +2. Fix bugs. + + Run the program, exercising the various options, and look for cases in which the program + crashes or otherwise misbehaves in an obvious way. We are only interested in obvious + misbehavior here; don't agonize over program behavior that might just have been the choice + of the original author. You should use the provided Criterion tests to help point the way, + though they are not guaranteed to be exhaustive. + +3. Use `valgrind` to identify any memory leaks or other memory access errors. + Fix any errors you find. + + Run `valgrind` using a command of the following form: + +
+      $ valgrind --leak-check=full --show-leak-kinds=all --undef-value-errors=yes [PAR PROGRAM AND ARGS]
+    
+ + Note that the bugs that are present will all manifest themselves in some way + either as incorrect output, program crashes or as memory errors that can be + detected by `valgrind`. It is not necessary to go hunting for obscure issues + with the program output. + Also, do not make gratuitous changes to the program output, as this will + interfere with our ability to test your code. + + > :scream: The author of this program was pretty fastidious about freeing memory before + > exiting the program. Once you have fixed the bugs, the program should exit without + > any type of memory leak reported by `valgrind`, including memory that is "still reachable" + > at the time of exit. "Still reachable" memory corresponds to memory that is in use + > when the program exits and can still be reached by following pointers from variables + > in the program. Although some people consider it to be untidy for a program + > to exit with "still reachable" memory, it doesn't cause any particular problem. + > For the present program, however, there should not be any "still reachable" memory. + + > :scream: You are **NOT** allowed to share or post on PIAZZA + > solutions to the bugs in this program, as this defeats the point of + > the assignment. You may provide small hints in the right direction, + > but nothing more. + +# Part 2: Changes to the Program + +## Rewrite/Extend Options Processing + +The basecode version of `par` performs its own *ad hoc* processing of command-line options. +This is likely due to the fact that there did not exist a commonly accepted library +package for performing this function at the time the program was written. +However, as options processing is a common function that is performed by most programs, +and it is desirable for programs on the same system to be consistent in how they interpret +their arguments, there have been more elaborate standardized libraries that have been written +for this purpose. 
In particular, the POSIX standard specifies a `getopt()` function, +which you can read about by typing `man 3 getopt`. A significant advantage to using a +standard library function like `getopt()` for processing command-line arguments, +rather than implementing *ad hoc* code to do it, is that all programs that use +the standard function will perform argument processing in the same way +rather than having each program implement its own quirks that the user has to remember. + +For this part of the assignment, you are to replace the original argument-processing +code in `main()` by code that uses the GNU `getopt` library package. +In addition to the POSIX standard `getopt()` function, the GNU `getopt` package +provides a function `getopt_long()` that understands "long forms" of option +arguments in addition to the traditional single-letter options. +In your revised program, `main()` should use `getopt_long()` to traverse the +command-line arguments, and it should support the following option syntax +(in place of what was originally used by the program): + + - `--version` (long form only): + Print the version number of the program. + + - `-w WIDTH` (short form) or `--width WIDTH` (long form): + Set the output paragraph width to `WIDTH`. + + - `-p PREFIX` (short form) or `--prefix PREFIX` (long form): + Set the value of the "prefix" parameter to `PREFIX`. + + - `-s SUFFIX` (short form) or `--suffix SUFFIX` (long form): + Set the value of the "suffix" parameter to `SUFFIX`. + + - `-h HANG` (short form) or `--hang HANG` (long form): + Set the value of the "hang" parameter to `HANG`. + + - `-l LAST` (short form) or either `--last` or + `--no-last` (long form): + Set the value of the boolean "last" parameter. + For the short form, the values allowed for `LAST` should be either + `0` or `1`. + + - `-m MIN` (short form) or either `--min` or `--no-min` (long form). + Set the value of the boolean "min" parameter. 
+ For the short form, the values allowed for `MIN` should be either + `0` or `1`. + +You will probably need to read the Linux "man page" on the `getopt` package. +This can be accessed via the command `man 3 getopt`. If you need further information, +search for "GNU getopt documentation" on the Web. + +> :scream: You MUST use the `getopt_long()` function to process the command line +> arguments passed to the program. Your program should be able to handle cases where +> the (non-positional) flags are passed IN ANY order. Make sure that you test the +> program with prefixes of the long option names, as well as the full names. + +## Revise the Error Message Scheme + +The original program uses a very *ad hoc* scheme for error-message reporting: +if an error occurs, a string describing the error is stored into a global +character array `errmsg` with a hard-coded maximum size. (This hard-coded +size has an occurrence in the `fprintf()` format string in `par.c`, +which creates undesirable implicit coupling between `par.c` and `errmsg.c`.) +At various points in the program, the existence of an error condition is checked +by looking to see if the first character of the error message string is a null +character `'\0'`. Before the program terminates, if an error message exists, +then it is printed and the program exits with an error status, otherwise it exits +with a success indication. + +Your job is to revise the error message scheme to make it somewhat more general +and to eliminate the hard-coded limitation on the length of an error message. +In particular, you should replace the interface defined in `errmsg.h` by the +following function prototypes (exactly as shown): + +```c +/** + * @brief Set an error indication, with a specified error message. + * @param msg Pointer to the error message. The string passed by the caller + * will be copied. + */ +void set_error(char *msg); + +/** + * @brief Test whether there is currently an error indication. 
+ * @return 1 if an error indication currently exists, 0 otherwise. + */ +int is_error(); + +/** + * @brief Issue any existing error message to the specified output stream. + * @param file Stream to which the error message is to be issued. + * @return 0 if either there was no existing error message, or else there + * was an existing error message and it was successfully output. + * Return non-zero if the attempt to output an existing error message + * failed. + */ +int report_error(FILE *file); + +/** + * Clear any existing error indication and free storage occupied by + * any existing error message. + */ +void clear_error(); +``` + +The global array `errmsg` should be removed from `errmsg.h` and replaced +by a pointer variable declared as `static char *` in `errmsg.c`. +The functions whose prototypes are given above should be implemented so +that there is no fixed maximum imposed on the length of an error message. +This means that error messages should be dynamically allocated on the +heap (for example, using `strdup()`). The implementation should take care +not to leak any memory used for error messages; for example if a new error +message is set when one already exists. Before exiting, the program should +call `clear_error()` to cause any existing error message to be freed. + +# Part 3: Testing the Program + +For this assignment, you have been provided with a basic set of +Criterion tests to help you debug the program. + +In the `tests/basecode_tests.c` file, there are five test examples. +You can run these with the following command: + +
+    $ bin/par_tests
+
+ +To obtain more information about each test run, you can supply the +additional option `--verbose=1`. +You can also specify the option `-j1` to cause the tests to be run sequentially, +rather than in parallel using multiple processes, as is the default. +The `-j1` flag is necessary if the tests could interfere with each other in +some way if they are run in parallel (such as writing the same output file). +You will probably find it useful to know this; however the basecode tests have +been written so that they each use output files named after the test and +(hopefully) will not interfere with each other. + +The tests have been constructed so that they will point you at most of the +problems with the program. +Each test has one or more assertions to make sure that the code functions +properly. If there was a problem before an assertion, such as a "segfault", +the test will print the error to the screen and continue to run the +rest of the tests. +The basecode test cases check the program operation by reading input from +a pre-defined input file, redirecting `stdout` and `stderr` to output files, +and comparing the output produced against pre-defined reference files. +Some of the tests use `valgrind` to verify that no memory errors are found. +If errors are found, then you can look at the log file that is left behind +(in the `test_output` directory) by the test code. +Alternatively, you can better control the information that `valgrind` provides +if you run it manually. + +The tests included in the base code are not true "unit tests", because they all +run the program as a black box using `system()`. +You should be able to follow the pattern to construct some additional tests of +your own, and you might find this helpful while working on the program. +You are encouraged to try to write some of these tests so that you learn how +to do it. 
Note that in the next homework assignment unit tests will likely +be very helpful to you and you will be required to write some of your own. +Criterion documentation for writing your own tests can be found +[here](http://criterion.readthedocs.io/en/master/). + + > :scream: Be sure that you test non-default program options to make sure that + > the program does not crash or otherwise misbehave when they are used. + +# Hand-in Instructions + +Ensure that all files you expect to be on your remote repository are committed +and pushed prior to submission. + +This homework's tag is: `hw2` + +
+$ git submit hw2
+
diff --git a/hw3-doc/README.md b/hw3-doc/README.md new file mode 100644 index 0000000..6d9aee8 --- /dev/null +++ b/hw3-doc/README.md @@ -0,0 +1,1079 @@ +# Homework 3 Dynamic Memory Allocator - CSE 320 - Spring 2022 +#### Professor Eugene Stark + +### **Due Date: Friday 3/25/2022 @ 11:59pm** + +We **HIGHLY** suggest that you read this entire document, the book chapter, +and examine the base code prior to beginning. If you do not read the entire +document before beginning, you may find yourself doing extra work. + +> :scream: Start early so that you have an adequate amount of time to test +your program! + +> :scream: The functions `malloc`, `free`, `realloc`, `memalign`, `calloc`, +> etc., are **NOT ALLOWED** in your implementation. If any of these functions, +> or any other function with similar functionality is found in your program, +> you **will receive a ZERO**. + +**NOTE:** In this document, we refer to a word as 2 bytes (16 bits) and a memory +row as 4 words (64 bits). We consider a page of memory to be 1024 bytes (1 KB) + +# Introduction + +You must read **Chapter 9.9 Dynamic Memory Allocation Page 839** before +starting this assignment. This chapter contains all the theoretical +information needed to complete this assignment. Since the textbook has +sufficient information about the different design strategies and +implementation details of an allocator, this document will not cover this +information. Instead, it will refer you to the necessary sections and pages in +the textbook. 
+ +## Takeaways + +After completing this assignment, you will have a better understanding of: +* The inner workings of a dynamic memory allocator +* Memory padding and alignment +* Structs and linked lists in C +* [errno](https://linux.die.net/man/3/errno) numbers in C +* Unit testing in C + +# Overview + +You will create an allocator for the x86-64 architecture with the following features: + +- Free lists segregated by size class, using first-fit policy within each size class, + augmented with a set of "quick lists" holding small blocks segregated by size. +- Immediate coalescing of large blocks on free with adjacent free blocks; + delayed coalescing on free of small blocks. +- Boundary tags to support efficient coalescing, with footer optimization that allows + footers to be omitted from allocated blocks. +- Block splitting without creating splinters. +- Allocated blocks aligned to "double memory row" (16-byte) boundaries. +- Free lists maintained using **last in first out (LIFO)** discipline. +- Obfuscation of block headers and footers to detect heap corruption and attempts to + free blocks not previously obtained via allocation. + +You will implement your own versions of the **malloc**, **realloc**, +and **free** functions. + +You will use existing Criterion unit tests and write your own to help debug +your implementation. + +## Free List Management Policy + +Your allocator **MUST** use the following scheme to manage free blocks: +Free blocks will be stored in a fixed array of `NUM_FREE_LISTS` free lists, +segregated by size class (see **Chapter 9.9.14 Page 863** for a discussion +of segregated free lists). +Each individual free list will be organized as a **circular, doubly linked list** +(more information below). +The size classes are based on a power-of-two geometric sequence (1, 2, 4, 8, 16, ...), +according to the following scheme: +The first free list (at index 0) holds blocks of the minimum size `M` +(where `M = 32` for this assignment). 
+The second list (at index 1) holds blocks of size `(M, 2M]`. +The third list (at index 2) holds blocks of size `(2M, 4M]`. +The fourth list holds blocks whose size is in the interval `(4M, 8M]`. +The fifth list holds blocks whose size is in the interval `(8M, 16M]`, +and so on. This pattern continues up to the interval `(128M, 256M]`, +and then the last list (at index `NUM_FREE_LISTS-1`; *i.e.* 9) +holds blocks of size greater than `256M`. +Allocation requests will be satisfied by searching the free lists in increasing +order of size class. + +## Block Placement Policy + +When allocating memory, use a **segregated fits policy**, modified by the use of quick lists +as follows. When an allocation request is received, the quick list containing blocks of the +appropriate size is first checked to try to quickly obtain a block of exactly the right size. +If there is no quick list of that size (quick lists are only maintained for a fixed set of +the smallest block sizes), or if there is a quick list but it is empty, then the request will +be satisfied from the main free lists. + +Satisfying a request from the main free lists is accomplished as follows: +First, the smallest size class that is sufficiently large to satisfy the request +is determined. The free lists are then searched, starting from the list for the +determined size class and continuing in increasing order of size, until a nonempty +list is found. The request is then satisfied by the first block in that list +that is sufficiently large; *i.e.* a **first-fit policy** +(discussed in **Chapter 9.9.7 Page 849**) is applied within each individual free list. + +If there is no exact match for an allocation request in the quick lists, and there +is no block in the main free lists that is large enough to satisfy the allocation request, +`sf_mem_grow` should be called to extend the heap by an additional page of memory. 
+After coalescing this page with any free block that immediately precedes it, you should +attempt to use the resulting block of memory to satisfy the allocation request; +splitting it if it is too large and no "splinter" (*i.e.* a remainder smaller than the +minimum block size) would result. If the block of memory is still not large enough, +another call to `sf_mem_grow` should be made; continuing to grow the heap until either +a large enough block is obtained or the return value from `sf_mem_grow` indicates that +there is no more memory. + +As discussed in the book, segregated free lists allow the allocator to approximate a +best-fit policy, with lower overhead than would be the case if an exact best-fit policy +were implemented. The rationale for the use of quick lists is that when a small block +is freed, it is likely that there will soon be another allocation request for a block +of that same size. By putting the block in a quick list, it can be re-used for such +a request without the overhead of coalescing and/or splitting that would be required +if the block were inserted back into the main pool. + +## Splitting Blocks & Splinters + +Your allocator must split blocks at allocation time to reduce the amount of +internal fragmentation. Details about this feature can be found in **Chapter 9.9.8 Page 849**. +Due to alignment and overhead constraints, there will be a minimum useful block size +that the allocator can support. **For this assignment, pointers returned by the allocator +in response to allocation requests are required to be aligned to 16-byte boundaries**; +*i.e.* the pointers returned will be addresses that are multiples of 2^4. +The 16-byte alignment requirement implies that the minimum block size for your allocator +will be 32 bytes. No "splinters" of smaller size than this are ever to be created. 
+If splitting a block to be allocated would result in a splinter, then the block should +not be split; rather, the block should be used as-is to satisfy the allocation request +(*i.e.*, you will "over-allocate" by issuing a block slightly larger than that required). + +> :thinking: How do the alignment and overhead requirements constrain the minimum block size? +> As you read more details about the format of a block header, block footer, and alignment requirements, +> you should try to answer this question. + +## Freeing a Block + +When a block is freed, if it is a small block it is inserted at the front of the quick list of the +appropriate size. Blocks in the quick lists are free, but the allocation bit remains set in +the header to prevent them from being coalesced with adjacent blocks. In addition, there is a +separate "in quick list" bit in the block header that is set for blocks in the quick lists, +to allow them to be readily distinguished from blocks that are actually allocated. +To avoid arbitrary growth of the quick lists, the capacity of each is limited to `QUICK_LIST_MAX` blocks. +If an attempt is made to insert a block into a quick list that is already at capacity, +the quick list is *flushed* by removing each of the blocks it currently contains and adding +them back into the main free lists, coalescing them with any adjacent free blocks as described +below. After flushing the quick list, the block currently being freed is inserted into the +now-empty list, leaving just one block in that list. + +When a block is added into the main free lists, an attempt should first be made to +**coalesce** the block with any free block that immediately precedes or follows it in the heap. +(See **Chapter 9.9.10 Page 850** for a discussion of the coalescing procedure.) +Once the block has been coalesced, it should be inserted at the **front** of the free +list for the appropriate size class (based on the size after coalescing). 
+The reason for performing coalescing is to combat the external fragmentation +that would otherwise result due to the splitting of blocks upon allocation. +Note that blocks inserted into quick lists are not immediately coalesced; they are only +coalesced at such later time as the quick list is flushed and the blocks are moved into the +main free lists. This is an example of a "deferred coalescing" strategy. + +## Block Headers & Footers + +In **Chapter 9.9.6 Page 847 Figure 9.35**, a block header is defined as 2 words +(32 bits) to hold the block size and allocated bit. In this assignment, the header +will be 4 words (i.e. 64 bits or 1 memory row). The header fields will be similar +to those in the textbook but with some differences. + +**Block Header Format:** +```c + +----------------------------+----------------------+--------+--------+---------+---------+ <- header + | payload size | block_size | unused | alloc |prv alloc|in qklst | + | (0/1) |(4 LSB's implicitly 0)| (0) | (1) | (0/1) | (0) | + | (32 bits) | (28 bits) | 1 bit | 1 bit | 1 bit | 1 bit | + +---------------------------------------------------+--------+--------+---------+---------+ <- (aligned) +``` + +- The `payload_size` field, which occupies the four most-significant bytes of the header + of an allocated block, will be used to store the payload size that was requested by the client + for that block. In a free block (including a block in a quick list) this field will be zero. +- The `block_size` field gives the number of bytes for the **entire** block (including header/footer, + payload, and padding). It occupies the four least-significant bytes of the block header or footer, + except that three of the four least-significant bits of the block size, which would normally always + be zero due to alignment requirements, are used to store additional information. 
+ This means that these bits have to be masked when retrieving the block size from the header and + when the block size is stored in the header the previously existing values of these bits have + to be preserved. +- The `alloc` bit (bit 2, mask 0x4) is a boolean. It is 1 if the block is allocated and 0 if it is free. +- The `prev_alloc` (bit 1, mask 0x2) is also a boolean. It is 1 if the **immediately preceding** block + in the heap is allocated and 0 if it is not. +- The `in_qklst` (bit 0, mask 0x1) is also a boolean. It is 1 if the block is currently in a quick list, + and 0 if it is not. Note that if this bit is a 1, then the `alloc` bit will also be a 1. +- The remaining bit out of the four least-significant bits (bit 3, mask 0x8) is not used, and should + always be 0. + +> :scream: Note that the fact that only four bytes are available for storing the block +> size and payload size in a block header means that the maximum size of a block will +> be at most 2^32 bytes. The types of the arguments to the `sf_malloc()` and `sf_realloc()` +> functions reflect this limitation. + +Each free block will also have a footer, which occupies the last memory row of the block. +The footer of a free block (including a block in a quick list) must contain exactly the +same information as the header. In an allocated block, the footer will not be present, +and the space that it would otherwise occupy may be used for payload. + +> :thinking: Here is an example of determining the block size required to satisfy +> a particular requested payload size. Suppose the requested size is 25 bytes. +> An additional 8 bytes will be required to store the block header, which must always +> be present. This means that a block of at least 33 bytes must be used, however due +> to alignment requirements this has to be rounded up to the next multiple of the +> alignment size. 
If the alignment size were 16 bytes (which would be just large enough +> to enable the memory returned by the allocator to store in an aligned fashion any of +> the basic data types supported by the x86-64 architecture), then a block of at least +> 48 bytes would have to be used. As a result, there would be 15 bytes of "padding" +> at the end of the payload area, which contributes to internal fragmentation. +> Besides the header, when the block is free it is also necessary to store a footer, +> as well as next and previous links for the freelist. +> These will take an additional 24 bytes of space, however when the block is free there +> is no payload so the payload area can be used to store this information, assuming that +> the payload area is big enough in the first place. But the payload area is 40 bytes +> (25 bytes plus 15 bytes of padding), which is certainly bigger than 24 bytes, +> so a block of total size 48 would be fine. +> Note that a block cannot be smaller than 32 bytes, as there would not then +> be enough space to store the header, footer, and freelist links when the block is free. + +## Obfuscation of Headers and Footers + +Your allocator has to satisfy one further requirement as regards the storage of the +block headers and footers. The headers and footers will not be stored directly in +memory; rather their contents will first be obfuscated by performing a bitwise XOR +(C operator `^`) with a "magic" value that is obtained by referencing the preprocessor +symbol `MAGIC`. This value is set randomly (by the utility code provided for you) +when the heap is first initialized. When a header or footer is read from memory, +it must again be XOR'ed with the magic value to expose the true contents. 
+The purpose of obfuscating the headers and footers in this way is to help detect attempts +to free pointers that were not obtained from a previous call to `malloc`, and also to make +it possible to detect some situations in which the heap has been corrupted by overwriting +of headers and/or footers. + +In the initial stages of debugging, you might find it helpful to turn off the header +and footer obfuscation. This can be accomplished by making an initial call of +`sf_set_magic(0x0)`. The effect of this is that the magic value will then always be `0x0`, +rather than a randomly chosen value. Once you have your code working with obfuscation +turned off in this way, don't forget to turn it back on again to test your code in the +correct configuration, because the `sf_set_magic()` function will be replaced by a dummy +version during grading. + +# Getting Started + +Fetch and merge the base code for `hw3` as described in `hw0` from the +following link: https://gitlab02.cs.stonybrook.edu/cse320/hw3 + +**Remember to use the `--strategy-option=theirs` flag with the `git merge` +command as described in the `hw1` doc to avoid merge conflicts in the Gitlab +CI file.** + +## Directory Structure + +
+.
+├── .gitignore
+├── .gitlab-ci.yml
+└── hw3
+    ├── hw3.sublime-project
+    ├── include
+    │   ├── debug.h
+    │   └── sfmm.h
+    ├── lib
+    │   └── sfutil.o
+    ├── Makefile
+    ├── src
+    │   ├── main.c
+    │   └── sfmm.c
+    └── tests
+        └── sfmm_tests.c
+
+ +The `lib` folder contains the object file for the `sfutil` library. This +library provides you with several functions to aid you with the implementation +of your allocator. **Do NOT delete this file as it +is an essential part of your homework assignment.** + +The provided `Makefile` creates object files from the `.c` files in the `src` +directory, places the object files inside the `build` directory, and then links +the object files together, including `lib/sfutil.o`, to make executables that +are stored to the `bin` directory. + +**Note:** `make clean` will not delete `sfutil.o` or the `lib` folder, but it +will delete all other contained `.o` files. + +The `sfmm.h` header file contains function prototypes and defines the format +of the various data structures that you are to use. + +> :scream: **DO NOT modify `sfmm.h` or the Makefile.** Both will be replaced when we run +> tests for grading. If you wish to add things to a header file, please create +> a new header file in the `include` folder + +All functions for your allocator (`sf_malloc`, `sf_free`, `sf_realloc`, +`sf_internal_fragmentation`, and `sf_peak_utilization`) +**must** be implemented in `src/sfmm.c`. + +The program in `src/main.c` contains a basic example of using the allocation functions. +Running `make` will create a `sfmm` executable in the `bin` directory. This can be run +using the command `bin/sfmm`. + +# Allocation Functions + +You will implement the three functions (`sf_malloc`, `sf_realloc`, and `sf_free`) +in the file `src/sfmm.c`. The file `include/sfmm.h` contains the prototypes and +documentation shown below. + +**Note:** Standard C library functions set `errno` when there is an error. +To avoid conflicts with these functions, your allocation functions will set `sf_errno`, +a variable declared as `extern` in `sfmm.h`. + +```c +/* + * This is your implementation of sf_malloc. It acquires uninitialized memory that + * is aligned and padded properly for the underlying system. 
+ * + * @param size The number of bytes requested to be allocated. + * + * @return If size is 0, then NULL is returned without setting sf_errno. + * If size is nonzero, then if the allocation is successful a pointer to a valid region of + * memory of the requested size is returned. If the allocation is not successful, then + * NULL is returned and sf_errno is set to ENOMEM. + */ +void *sf_malloc(sf_size_t size); + +/* + * Resizes the memory pointed to by ptr to size bytes. + * + * @param ptr Address of the memory region to resize. + * @param size The minimum size to resize the memory to. + * + * @return If successful, the pointer to a valid region of memory is + * returned, else NULL is returned and sf_errno is set appropriately. + * + * If sf_realloc is called with an invalid pointer sf_errno should be set to EINVAL. + * If there is no memory available sf_realloc should set sf_errno to ENOMEM. + * + * If sf_realloc is called with a valid pointer and a size of 0 it should free + * the allocated block and return NULL without setting sf_errno. + */ +void *sf_realloc(void *ptr, sf_size_t size); + +/* + * Marks a dynamically allocated region as no longer in use. + * Adds the newly freed block to the free list. + * + * @param ptr Address of memory returned by the function sf_malloc. + * + * If ptr is invalid, the function calls abort() to exit the program. + */ +void sf_free(void *ptr); +``` + +> :scream: Make sure these functions have these exact names +> and arguments. They must also appear in the correct file. If you do not name +> the functions correctly with the correct arguments, your program will not +> compile when we test it. **YOU WILL GET A ZERO** + +# Statistics Functions + +Besides the allocation functions discussed above, you are to implement the +following two functions that return statistics about the memory utilization +of the allocator: + +```c +/* + * Get the current amount of internal fragmentation of the heap. 
+ * + * @return the current amount of internal fragmentation, defined to be the + * ratio of the total amount of payload to the total size of allocated blocks. + * If there are no allocated blocks, then the returned value should be 0.0. + */ +double sf_internal_fragmentation(); + +/* + * Get the peak memory utilization for the heap. + * + * @return the peak memory utilization over the interval starting from the + * time the heap was initialized, up to the current time. The peak memory + * utilization at a given time, as defined in the lecture and textbook, + * is the ratio of the maximum aggregate payload up to that time, divided + * by the current heap size. If the heap has not yet been initialized, + * this function should return 0.0. + */ +double sf_peak_utilization(); +``` + +These functions are also to be implemented in `sfmm.c`. + +> Any functions other than `sf_malloc`, `sf_free`, `sf_realloc`, +> `sf_internal_fragmentation`, and `sf_peak_utilization` +> **WILL NOT** be graded. + +# Initialization Functions + +In the `lib` directory, we have provided you with the `sfutil.o` object file. +When linked with your program, this object file allows you to access the +`sfutil` library, which contains the following functions: + +```c +/* + * @return The starting address of the heap for your allocator. + */ +void *sf_mem_start(); + +/* + * @return The ending address of the heap for your allocator. + */ +void *sf_mem_end(); + +/* + * This function increases the size of your heap by adding one page of + * memory to the end. + * + * @return On success, this function returns a pointer to the start of the + * additional page, which is the same as the value that would have been returned + * by sf_mem_end() before the size increase. On error, NULL is returned. + */ +void *sf_mem_grow(); + +/* The size of a page of memory returned by sf_mem_grow(). 
*/ +#define PAGE_SZ ((sf_size_t)1024) +``` + +```c +/* + * @return The "magic number" used to obfuscate header and footer contents + * to make it difficult to free a block without having first successfully + * malloc'ed one. To obtain the ability to turn off obfuscation using the + * -DWEAK_MAGIC compilation flag, you should not call this function directly + * but rather use the preprocessor symbol MAGIC where the magic number is + * required. + */ +sf_header sf_magic(); + +/* Define WEAK_MAGIC during compilation to use MAGIC of 0x0 for debugging purposes. */ +#ifndef WEAK_MAGIC +#define MAGIC (sf_magic()) +#else +#define MAGIC ((sf_header)0x0) +#endif +``` + +> :scream: As these functions are provided in a pre-built .o file, the source +> is not available to you. You will not be able to debug these using gdb. +> You must treat them as black boxes. + +# sf_mem_grow + +The function `sf_mem_grow` is to be invoked by `sf_malloc`, at the time of the +first allocation request to obtain an initial free block, and on subsequent allocations +when a large enough block to satisfy the request is not found. +For this assignment, your implementation **MUST ONLY** use `sf_mem_grow` to +extend the heap. **DO NOT** use any system calls such as **brk** or **sbrk** +to do this. + +Function `sf_mem_grow` returns memory to your allocator in pages. +Each page is 1024 bytes (1 KB) and there are a limited, small number of pages +available (the actual number may vary, so do not hard-code any particular limit +into your program). Each call to `sf_mem_grow` extends the heap by one page and +returns a pointer to the new page (this will be the same pointer as would have +been obtained from `sf_mem_end` before the call to `sf_mem_grow`). + +The `sf_mem_grow` function also keeps track of the starting and ending addresses +of the heap for you. You can get these addresses through the `sf_mem_start` and +`sf_mem_end` functions.
+ +> :smile: A real allocator would typically use the **brk**/**sbrk** system calls +> for small memory allocations and the **mmap**/**munmap** system calls +> for large allocations. To allow your program to use other functions provided by +> glibc, which rely on glibc's allocator (*i.e.* `malloc`), we have provided +> `sf_mem_grow` as a safe wrapper around **sbrk**. This makes it so your heap and +> the one managed by glibc do not interfere with each other. + +# sf_magic and MAGIC + +The `sf_magic` function returns the random bit pattern that is being used +to obfuscate header and footer contents. This bit pattern must be XOR'ed +with header or footer contents to obfuscate them before storing them in memory, +and it must also be XOR'ed with header or footer contents to de-obfuscate them +after reading them from memory. +Instead of calling the `sf_magic` function directly, you should use the +preprocessor symbol `MAGIC` to obtain the magic bit pattern. +The definition of `MAGIC` is affected by whether or not the `WEAK_MAGIC` +C preprocessor symbol is defined during compilation. If `WEAK_MAGIC` +was not defined, then `MAGIC` calls `sf_magic()` to obtain the magic bit pattern. +If `WEAK_MAGIC` was defined, then `MAGIC` is defined to be `0x0`. +The latter is useful during debugging, so you can examine the contents of headers +and footers in `gdb` without being confused by the obfuscation. +You can arrange for `WEAK_MAGIC` to be defined during compilation by uncommenting +the `-D WEAK_MAGIC` in the `DFLAGS` setting in the `Makefile`. +Make sure that you test your code with `-D WEAK_MAGIC` commented out (as it is +in the basecode distribution), because this is how your code will be compiled +during grading.
+ +# Implementation Details + +## Memory Row Size + +The table below lists the sizes of data types (following Intel standard terminology) +on x86-64 Linux Mint: + +| C declaration | Data type | x86-64 Size (Bytes) | +| :--------------: | :----------------: | :----------------------: | +| char | Byte | 1 | +| short | Word | 2 | +| int | Double word | 4 | +| long int | Quadword | 8 | +| unsigned long | Quadword | 8 | +| pointer | Quadword | 8 | +| float | Single precision | 4 | +| double | Double precision | 8 | +| long double | Extended precision | 16 | + +> :nerd: You can find these sizes yourself using the sizeof operator. +> For example, `printf("%lu\n", sizeof(int))` prints 4. + +In this assignment we will assume that each "memory row" is 8 bytes (64 bits) in size. +All pointers returned by your `sf_malloc` are to be 16-byte aligned; that is, they will be +addresses that are multiples of 16. This requirement permits such pointers to be used to +store any of the basic machine data types in a "naturally aligned" fashion. +A value stored in memory is said to be *naturally aligned* if the address at which it +is stored is a multiple of the size of the value. For example, an `int` value is +naturally aligned when stored at an address that is a multiple of 4. A `long double` value +is naturally aligned when stored at an address that is a multiple of 16. +Keeping values naturally aligned in memory is a hardware-imposed requirement for some +architectures, and improves the efficiency of memory access in other architectures.
+ +## Block Header & Footer Fields + +The various header and footer formats are specified in `include/sfmm.h`: + +```c + Format of an allocated memory block + +-----------------------------------------------------------------------------------------+ + | 64-bit-wide row | + +-----------------------------------------------------------------------------------------+ + + +----------------------------+----------------------+--------+--------+---------+---------+ <- header + | payload size | block_size | unused | alloc |prv alloc|in qklst | + | (0/1) |(4 LSB's implicitly 0)| (0) | (1) | (0/1) | (0) | + | (32 bits) | (28 bits) | 1 bit | 1 bit | 1 bit | 1 bit | + +---------------------------------------------------+--------+--------+---------+---------+ <- (aligned) + | | + | Payload and Padding | + | (N rows) | + | | + | | + +-----------------------------------------------------------------------------------------+ + + NOTE: For an allocated block, there is no footer (it is used for payload). + NOTE: The actual stored header is obfuscated by bitwise XOR'ing with MAGIC. + The above diagram shows the un-obfuscated contents. 
+``` + +```c + Format of a memory block in a quick list + +-----------------------------------------------------------------------------------------+ + | 64-bit-wide row | + +-----------------------------------------------------------------------------------------+ + + +----------------------------+----------------------+--------+--------+---------+---------+ <- header + | unused | block_size | unused | alloc |prv alloc|in qklst | + | (0) |(4 LSB's implicitly 0)| (0) | (1) | (0/1) | (1) | + | (32 bits) | (28 bits) | 1 bit | 1 bit | 1 bit | 1 bit | + +---------------------------------------------------+--------+--------+---------+---------+ <- (aligned) + | | + | Payload and Padding | + | (N rows) | + | | + | | + +-----------------------------------------------------------------------------------------+ + + NOTE: For a block in a quick list, there is no footer. +``` + +```c + Format of a free memory block + + + +----------------------------+----------------------+--------+--------+---------+---------+ <- header + | unused | block_size | unused | alloc |prv alloc|in qklst | + | (0) |(4 LSB's implicitly 0)| (0) | (0) | (0/1) | (0) | + | (32 bits) | (28 bits) | 1 bit | 1 bit | 1 bit | 1 bit | + +------------------------------------------------------------+--------+---------+---------+ <- (aligned) + | | + | Pointer to next free block | + | (1 row) | + +-----------------------------------------------------------------------------------------+ + | | + | Pointer to previous free block | + | (1 row) | + +-----------------------------------------------------------------------------------------+ + | | + | Unused | + | (N rows) | + | | + | | + +------------------------------------------------------------+--------+---------+---------+ <- footer + | unused | block_size | unused | alloc |prv alloc|in qklst | + | (0) |(4 LSB's implicitly 0)| (0) | (0) | (0/1) | (0) | + | (32 bits) | (28 bits) | 1 bit | 1 bit | 1 bit | 1 bit | + 
+------------------------------------------------------------+--------+---------+---------+ + + NOTE: For a free block, footer contents must always be identical to header contents. + NOTE: The actual stored footer is obfuscated by bitwise XOR'ing with MAGIC. + The above diagram shows the un-obfuscated contents. +``` + +The `sfmm.h` header file contains C structure definitions corresponding to the above diagrams: + +```c +#define IN_QUICK_LIST 0x1 +#define PREV_BLOCK_ALLOCATED 0x2 +#define THIS_BLOCK_ALLOCATED 0x4 + +typedef uint32_t sf_size_t; +typedef uint64_t sf_header; +typedef sf_header sf_footer; + +/* + * Structure of a block. + * The first field of this structure is actually the footer of the *previous* block. + * This must be taken into account when creating sf_block pointers from memory addresses. + */ +typedef struct sf_block { + sf_footer prev_footer; // NOTE: This actually belongs to the *previous* block. + sf_header header; // This is where the current block really starts. + union { + /* A free block contains links to other blocks in a free list. */ + struct { + struct sf_block *next; + struct sf_block *prev; + } links; + /* An allocated block contains a payload (aligned), starting here. */ + char payload[0]; // Length varies according to block size. + } body; +} sf_block; +``` + +For `sf_block`, the `body` field is a `union`, which has been used to emphasize +the difference between the information contained in a free block and that contained +in an allocated block. If the block is free, then its `body` has a `links` field, +which is a `struct` containing `next` and `prev` pointers. If the block is +allocated, then its `body` does not have a `links` field, but rather has a `payload`, +which starts at the same address that the `links` field would have started if the +block were free. 
The size of the `payload` is obviously not zero, but as it is +variable and only determined at run time, the `payload` field has been declared +to be an array of length 0 just to enable the use of `bp->body.payload` to obtain +a pointer to the payload area, if `bp` is a pointer to `sf_block`. + +> :thumbsup: You can use casts to convert a generic pointer value to one +> of type `sf_block *` or `sf_header *`, in order to make use of the above +> structure definitions to easily access the various fields. You can even cast +> an integer value to these pointer types; this is sometimes required when +> calculating the locations of blocks in the heap. + +When a block is free, it must have a valid footer whose contents are identical to the +header contents. We will use a "footer optimization" technique that permits a footer +to be omitted from allocated blocks; thereby making the space that would otherwise +be occupied by the footer available for use by payload. The footer optimization +technique involves maintaining a bit in the header of each block that can be checked +to find out if the immediately preceding block is allocated or free. +If the preceding block is free, then its footer can be examined to find out its +size and then the size can be used to calculate the block's starting address for the +purpose of performing coalescing. +If the preceding block is **not** free, then it has no footer, but as we can only +coalesce with a free block there is no need for the information that we would have +found in the footer, anyway. + +> :scream: Note that the `prev_footer` field in the `sf_block` structure is actually +> part of the **previous** block in the heap. In order to initialize an `sf_block` +> pointer to correctly access the fields of a block, it is necessary to compute the +> address of the footer of the immediately preceding block in the heap and then cast +> that address to type `sf_block *`. 
The footer of a particular block can be obtained +> by first getting an `sf_block *` pointer for that block and then using the contained +> information (*i.e.* the block size) to obtain the `prev_footer` field of the +> **next** block in the heap. The `sf_block` structure has been specified this way +> so as to permit it to be defined with a fixed size, even though the payload size +> is unknown and will vary. + +## Quick List and Free List Heads + +In the file `include/sfmm.h`, you will see the following declaration: + +```c +#define NUM_QUICK_LISTS 10 /* Number of quick lists. */ +#define QUICK_LIST_MAX 5 /* Maximum number of blocks permitted on a single quick list. */ + +struct { + int length; // Number of blocks currently in the list. + struct sf_block *first; // Pointer to first block in the list. +} sf_quick_lists[NUM_QUICK_LISTS]; + +#define NUM_FREE_LISTS 10 +struct sf_block sf_free_list_heads[NUM_FREE_LISTS]; +``` + +The array `sf_quick_lists` contains the heads of the quick lists, +which are maintained as **singly linked lists** accessed in LIFO fashion +(*i.e.* like stacks). The capacity of each quick list is limited to +a maximum of `QUICK_LIST_MAX` blocks. Inserting into a quick list that is +at capacity causes the quick list to be flushed as discussed elsewhere. + +The array `sf_free_list_heads` contains the heads of the main free lists, +which are maintained as **circular, doubly linked lists**. +Each node in a free list contains a `next` pointer that points to the next +node in the list, and a `prev` pointer that points to the previous node. +For each index `i` with `0 <= i < NUM_FREE_LISTS` the variable `sf_free_list_heads[i]` +is a dummy, "sentinel" node, which is used to connect the beginning and the end of +the list at index `i`. This sentinel node is always present and (aside from its `next` +and `prev` pointers) does **not** contain any other data.
If the list is empty, +then the fields `sf_free_list_heads[i].body.links.next` and `sf_free_list_heads[i].body.links.prev` +both contain `&sf_free_list_heads[i]` (*i.e.* the sentinel node points back to itself). +If the list is nonempty, then `sf_free_list_heads[i].body.links.next` points to the +first node in the list and `sf_free_list_heads[i].body.links.prev` points to the +last node in the list. +Inserting into and deleting from a circular doubly linked list is done +in the usual way, except that, owing to the use of the sentinel, there +are no edge cases for inserting or removing at the beginning or the end +of the list. +If you need a further introduction to this data structure, you can readily +find information on it by googling ("circular doubly linked lists with sentinel"). + +> :scream: You **MUST** use the `sf_free_list_heads` array for the heads +> of your free lists and you **MUST** maintain these lists as circular, +> doubly linked lists. +> The helper functions discussed later, as well as the unit tests, +> will assume that you have done this when accessing your free lists. + +> :scream: Note that the head of a freelist must be initialized before the list +> can be used. The initialization is accomplished by setting the `next` and `prev` +> pointers of the sentinel node to point back to the node itself. + +## Overall Structure of the Heap + +The overall structure of the allocatable area of your heap will be a sequence of allocated +and free blocks. +Your heap should also contain a prologue and epilogue (as described in the book, **page 855**) +to arrange for the proper block alignment and to avoid edge cases when coalescing blocks.
+The overall organization of the heap is as shown below: + +```c + Format of the heap + + +-----------------------------------------------------------------------------------------+ + | 64-bit-wide row | + +-----------------------------------------------------------------------------------------+ + + +-----------------------------------------------------------------------------------------+ <- heap start + | | (aligned) + | Unused | + | (1 row) | + +----------------------------+----------------------+--------+--------+---------+---------+ <- header + | payload size |minimum block_size(32)| unused | alloc |prv alloc|in qklst | + | (0) |(4 LSB's implicitly 0)| (0) | (1) | (0/1) | (0) | prologue block + | (32 bits) | (28 bits) | 1 bit | 1 bit | 1 bit | 1 bit | + +------------------------------------------------------------+--------+---------+---------+ <- (aligned) + | | + | Unused Payload Area | + | (3 rows) | + +------------------------------------------------------------+--------+---------+---------+ <- header + | payload size | block_size | unused | alloc |prv alloc|in qklst | + | (0/1) |(4 LSB's implicitly 0)| (0) | (0/1) | (0/1) | (0/1) | first block + | (32 bits) | (28 bits) | 1 bit | 1 bit | 1 bit | 1 bit | + +------------------------------------------------------------+--------+---------+---------+ <- (aligned) + | | + | Payload and Padding | + | (N rows) | + | | + | | + +--------------------------------------------+------------------------+---------+---------+ + | | + | | + | | + | | + | Additional allocated and free blocks | + | | + | | + | | + +-----------------------------------------------------------------------------------------+ + | payload size | block_size | unused | alloc |prv alloc|in qklst | + | (0) | (0) | (0) | (1) | (0/1) | (0) | epilogue + | (32 bits) | (28 bits) | 1 bit | 1 bit | 1 bit | 1 bit | + +------------------------------------------------------------+--------+---------+---------+ <- heap_end + (aligned) + + NOTE: The actual stored 
epilogue is obfuscated by bitwise XOR'ing with MAGIC. + The above diagram shows the un-obfuscated contents. +``` + +The heap begins with unused "padding", so that the header of each block will start +`sizeof(sf_header)` bytes before an alignment boundary. +The first block of the heap is the "prologue", which is an allocated block of minimum +size with an unused payload area. + +At the end of the heap is an "epilogue", which consists only of an allocated header, +with block size set to 0. +The prologue and epilogue are never used to satisfy allocation requests and they +are never freed. +Whenever the heap is extended, a new epilogue is created at the end of the +newly added region and the old epilogue becomes the header of the new block. +This is as described in the book. + +We do not make any separate C structure definitions for the prologue and epilogue. +They can be manipulated using the existing `sf_block` structure, though care must be taken +not to access fields that are not valid for these special blocks +(*i.e.* `prev_footer` for the prologue and anything other than `header` and `prev_footer` +for the epilogue). + +As your heap is initially empty, at the time of the first call to `sf_malloc` +you will need to make one call to `sf_mem_grow` to obtain a page of memory +within which to set up the prologue and initial epilogue. +The remainder of the memory in this first page should then be inserted into +the free list as a single block. + +## Notes on sf_malloc + +When implementing your `sf_malloc` function, first determine if the request size +is 0. If so, then return `NULL` without setting `sf_errno`. +If the request size is non-zero, then you should determine the size of the +block to be allocated by adding the header size and the size of any necessary +padding to reach a size that is a multiple of 16 to maintain proper alignment. 
+Remember also that the block has to be big enough to store the footer +as well as the `next` and `prev` pointers when the block is free. +As these fields are not present in an allocated block this space can (and should) +be overlapped with the payload area. +As has already been discussed, the above constraints lead to a minimum block size +of 32 bytes, so you should not attempt to allocate any block smaller than this. +After having determined the required block size, you should first check the +quick lists to see if they contain a block of that size. +If they do not, you should determine the index of the first main free list +that would be able to satisfy a request of that size. +Search that free list from the beginning until the first sufficiently large +block is found. If there is no such block, continue with the next larger +size class. +If a big enough block is found, then after splitting it (if it will not leave +a splinter), you should insert the remainder part back into the appropriate +freelist. When splitting a block, the "lower part" should be used to +satisfy the allocation request and the "upper part" should become the remainder. +Do not insert this remainder portion into any quick list; it should be put +directly into the main free lists. + +If a big enough block is not found in any of the freelists, then you +must use `sf_mem_grow` to request more memory +(for requests larger than a page, more than one such call might be required). +If your allocator ultimately cannot satisfy the request, your `sf_malloc` function +must set `sf_errno` to `ENOMEM` and return `NULL`. + +### Notes on sf_mem_grow + +After each call to `sf_mem_grow`, you must attempt to coalesce the newly +allocated page with any free block immediately preceding it, in order to build +blocks larger than one page. Insert the new block at the beginning of +the appropriate main freelist (*not* a quick list). + +**Note:** Do not coalesce past the beginning or end of the heap. 
+ +## Notes on sf_free + +When implementing `sf_free`, you must first verify that the pointer being +passed to your function belongs to an allocated block. This can be done by +examining the fields in the block header. In this assignment, we will consider +the following cases to be invalid pointers: + +- The pointer is `NULL`. +- The pointer is not 16-byte aligned. +- After XOR'ing the stored header with `MAGIC`: + * The block size is less than the minimum block size of 32. + * The block size is not a multiple of 16 + * The header of the block is before the start of the first block + of the heap, or the footer of the block is after the end of the last + block in the heap. + * The `allocated` bit in the header is 0. + * The `prev_alloc` field in the header is 0, indicating that the previous + block is free, but the `alloc` field of the previous block header is not 0. + +If an invalid pointer is passed to your function, you must call `abort` to exit +the program. Use the man page for the `abort` function to learn more about this. + +After confirming that a valid pointer was given, you must free the block. +If the block size matches the size of one of the quick lists, it should be +inserted into that quick list, flushing the quick list first if it is already +at capacity. Otherwise, the block is inserted at the *front* of the appropriate +main free list, after coalescing with any adjacent free block. + +Note that blocks in a main free list must **not** be marked as allocated, +and they must have a valid footer with contents identical to the block header. +In contrast, blocks in a quick list **are** marked as allocated, and they +do not have any footer. In addition, blocks in a quick list have the +`IN_QUICK_LIST` bit set in their header. + +# Notes on sf_realloc + +When implementing your `sf_realloc` function, you must first verify that the +pointer passed to your function is valid. 
The criteria for pointer validity +are the same as those described in the 'Notes on sf_free' section above. +If the pointer is valid but the size parameter is 0, free the block and return `NULL`. + +After verifying the parameters, consider the cases described below. +Note that in some cases, `sf_realloc` is more complicated than calling `sf_malloc` +to allocate more memory, `memcpy` to move the old memory to the new memory, and +`sf_free` to free the old memory. + +## Reallocating to a Larger Size + +When reallocating to a larger size, always follow these four steps: + +1. Call `sf_malloc` to obtain a larger block. + +2. Call `memcpy` to copy the data in the block given by the client to the block +returned by `sf_malloc`. Be sure to copy the entire payload area, but no more. + +3. Call `sf_free` on the block given by the client (inserting into a quick list +or main freelist and coalescing if required). + +4. Return the block given to you by `sf_malloc` to the client. + +If `sf_malloc` returns `NULL`, `sf_realloc` must also return `NULL`. Note that +you do not need to set `sf_errno` in `sf_realloc` because `sf_malloc` should +take care of this. + +## Reallocating to a Smaller Size + +When reallocating to a smaller size, your allocator must use the block that was +passed by the caller. You must attempt to split the returned block. There are +two cases for splitting: + +- Splitting the returned block results in a splinter. In this case, do not +split the block. Leave the splinter in the block, update the header field +if necessary, and return the same block back to the caller. + +**Example:** + +
+            b                                               b
++----------------------+                       +------------------------+
+| allocated            |                       |   allocated.           |
+| Blocksize: 64 bytes  |   sf_realloc(b, 32)   |   Block size: 64 bytes |
+| payload: 48 bytes    |                       |   payload: 32 bytes    |
+|                      |                       |                        |
+|                      |                       |                        |
++----------------------+                       +------------------------+
+
+ +In the example above, splitting the block would have caused a 16-byte splinter. +Therefore, the block is not split. + +- The block can be split without creating a splinter. In this case, split the +block and update the block size fields in both headers. Free the remainder block +by inserting it into the appropriate free list (after coalescing, if possible -- +do not insert the remainder block into a quick list). +Return a pointer to the payload of the now-smaller block to the caller. + +Note that in both of these sub-cases, you return a pointer to the same block +that was given to you. + +**Example:** + +
+            b                                              b
++----------------------+                       +------------------------+
+| allocated            |                       | allocated |  free      |
+| Blocksize: 128 bytes |   sf_realloc(b, 50)   | 64 bytes  |  64 bytes. |
+| payload: 80 bytes    |                       | payload:  |            |
+|                      |                       | 50 bytes  | goes into  |
+|                      |                       |           | free list  |
++----------------------+                       +------------------------+
+
+ +# Helper Functions + +The `sfutil` library additionally contains the following helper functions, +which should be self explanatory. They all output to `stderr`. + +```c +void sf_show_block(sf_block *bp); +void sf_show_blocks(); +void sf_show_free_list(int index); +void sf_show_free_lists(); +void sf_show_quick_list(int index); +void sf_show_quick_lists(); +void sf_show_heap(); +``` + +We have provided these functions to help you visualize your free lists and +allocated blocks. + +# Unit Testing + +For this assignment, we will use Criterion to test your allocator. We have +provided a basic set of test cases and you will have to write your own as well. + +You will use the Criterion framework alongside the provided helper functions to +ensure your allocator works exactly as specified. + +In the `tests/sfmm_tests.c` file, there are ten unit test examples. These tests +check for the correctness of `sf_malloc`, `sf_realloc`, and `sf_free`. +We provide some basic assertions, but by no means are they exhaustive. It is your +job to ensure that your header/footer bits are set correctly and that blocks are +allocated/freed as specified. + +## Compiling and Running Tests + +When you compile your program with `make`, a `sfmm_tests` executable will be +created in the `bin` folder alongside the `main` executable. This can be run +with `bin/sfmm_tests`. To obtain more information about each test run, you can +use the verbose print option: `bin/sfmm_tests --verbose`. +You might also find it helpful to suppress the running of tests concurrently +by giving the `--j1` option. +It is also possible to restrict the set of tests that are run. For example, +using `--filter suite_name/test_name` will only run the test named `test_name` +in test suite `suite_name` (if there is such a test, otherwise it will run +no tests). + +# Writing Criterion Tests + +The first test `malloc_an_int` tests `sf_malloc`. +It allocates space for an integer and assigns a value to that space. 
+It then runs an assertion to make sure that the space returned by `sf_malloc` +was properly assigned. + +```c +cr_assert(*x == 4, "sf_malloc failed to give proper space for an int!"); +``` + +The string after the assertion only gets printed to the screen if the assertion +failed (i.e. `*x != 4`). However, if there is a problem before the assertion, +such as a SEGFAULT, the unit test will print the error to the screen and +continue to run the rest of the unit tests. + +For this assignment **you must write 5 additional unit tests +which test new functionality and add them to `sfmm_tests.c` below the following +comment:** + +> :scream: You should definitely not regard the style in which the given tests +> have been written as an example of the correct way to write such tests. +> These handout tests have been deliberately coded in such a way as to avoid +> giving away too much information about how you might write the allocator code. +> The tests contain many hard-coded numeric values and intentionally somewhat +> obscure pointer manipulations. You would do well **not** to follow this example, +> but rather to devise functions and macros that make your own code easier to write +> and to read. Exactly how you might do this has been left for you to work out! + +``` +//############################################ +//STUDENT UNIT TESTS SHOULD BE WRITTEN BELOW +//DO NOT DELETE THESE COMMENTS +//############################################ +``` + +> For additional information on Criterion library, take a look at the official +> documentation located [here](http://criterion.readthedocs.io/en/master/)! This +> documentation is VERY GOOD. + +# Hand-in instructions +Make sure your directory tree looks like it did originally after merging the basecode, +and that your homework compiles. + +This homework's tag is: `hw3` + +
+$ git submit hw3
+
+ +# A Word to the Wise + +This program will be very difficult to get working unless you are +extremely disciplined about your coding style. Think carefully about how +to modularize your code in a way that makes it easier to understand and +avoid mistakes. Verbose, repetitive code is error-prone and **evil!** +When writing your program try to comment as much as possible. +Format the code consistently. It is much easier for your TA and the +professor to help you if we can quickly figure out what your code does. diff --git a/hw4-doc/README.md b/hw4-doc/README.md new file mode 100644 index 0000000..9978287 --- /dev/null +++ b/hw4-doc/README.md @@ -0,0 +1,621 @@ +# Homework 4 Scripting Language - CSE 320 - Spring 2022 +#### Professor Eugene Stark + +### **Due Date: Friday 4/15/2022 @ 11:59pm** + +## Introduction + +The goal of this assignment is to become familiar with low-level Unix/POSIX system +calls related to processes, signal handling, files, and I/O redirection. +You will implement an interpreter, called `mush`, for a simple scripting language +that is capable of managing multiple concurrently executing "jobs". + +### Takeaways + +After completing this assignment, you should: + +* Understand process execution: forking, executing, and reaping. +* Understand signal handling. +* Understand the use of "dup" to perform I/O redirection. +* Have a more advanced understanding of Unix commands and the command line. +* Have gained experience with C libraries and system calls. +* Have enhanced your C programming abilities. + +## Hints and Tips + +* We **strongly recommend** that you check the return codes of **all** system calls + and library functions. This will help you catch errors. +* **BEAT UP YOUR OWN CODE!** Use a "monkey at a typewriter" approach to testing it + and make sure that no sequence of operations, no matter how ridiculous it may + seem, can crash the program. 
+* Your code should **NEVER** crash, and we will deduct points every time your + program crashes during grading. Especially make sure that you have avoided + race conditions involving process termination and reaping that might result + in "flaky" behavior. If you notice odd behavior you don't understand: + **INVESTIGATE**. +* You should use the `debug` macro provided to you in the base code. + That way, when your program is compiled without `-DDEBUG`, all of your debugging + output will vanish, preventing you from losing points due to superfluous output. + +> :nerd: When writing your program, try to comment as much as possible and stay +> consistent with code formatting. Keep your code organized, and don't be afraid +> to introduce new source files if/when appropriate. + +### Reading Man Pages + +This assignment will involve the use of many system calls and library functions +that you probably haven't used before. +As such, it is imperative that you become comfortable looking up function +specifications using the `man` command. + +The `man` command stands for "manual" and takes the name of a function or command +(programs) as an argument. +For example, if I didn't know how the `fork(2)` system call worked, I would type +`man fork` into my terminal. +This would bring up the manual for the `fork(2)` system call. + +> :nerd: Navigating through a man page once it is open can be weird if you're not +> familiar with these types of applications. +> To scroll up and down, you simply use the **up arrow key** and **down arrow key** +> or **j** and **k**, respectively. +> To exit the page, simply type **q**. +> That having been said, long `man` pages may look like a wall of text. +> So it's useful to be able to search through a page. +> This can be done by typing the **/** key, followed by your search phrase, +> and then hitting **enter**. +> Note that man pages are displayed with a program known as `less`. 
+> For more information about navigating the `man` pages with `less`, +> run `man less` in your terminal. + +Now, you may have noticed the `2` in `fork(2)`. +This indicates the section in which the `man` page for `fork(2)` resides. +Here is a list of the `man` page sections and what they are for. + +| Section | Contents | +| ----------------:|:--------------------------------------- | +| 1 | User Commands (Programs) | +| 2 | System Calls | +| 3 | C Library Functions | +| 4 | Devices and Special Files | +| 5 | File Formats and Conventions | +| 6 | Games, et al | +| 7 | Miscellanea | +| 8 | System Administration Tools and Daemons | + +From the table above, we can see that `fork(2)` belongs to the system call section +of the `man` pages. +This is important because there are functions like `printf` which have multiple +entries in different sections of the `man` pages. +If you type `man printf` into your terminal, the `man` program will start looking +for that name starting from section 1. +If it can't find it, it'll go to section 2, then section 3 and so on. +However, there is actually a Bash user command called `printf`, so instead of getting +the `man` page for the `printf(3)` function which is located in `stdio.h`, +we get the `man` page for the Bash user command `printf(1)`. +If you specifically wanted the function from section 3 of the `man` pages, +you would enter `man 3 printf` into your terminal. + +> :scream: Remember this: **`man` pages are your bread and butter**. +> Without them, you will have a very difficult time with this assignment. + +## Getting Started + +Fetch and merge the base code for `hw4` as described in `hw1`. +You can find it at this link: https://gitlab02.cs.stonybrook.edu/cse320/hw4 + +Here is the structure of the base code: +
+.
+├── .gitlab-ci.yml
+└── hw4
+    ├── demo
+    │   └── mush
+    ├── include
+    │   ├── debug.h
+    │   ├── mush.h
+    │   ├── mush.tab.h
+    │   └── syntax.h
+    ├── Makefile
+    ├── rsrc
+    │   ├── bg_test.mush
+    │   ├── cancel_test.mush
+    │   ├── delete_test.mush
+    │   ├── fg_test.mush
+    │   ├── goto_test.mush
+    │   ├── list_test.mush
+    │   ├── loop1.mush
+    │   ├── loop2.mush
+    │   ├── pause_test.mush
+    │   ├── pipeline_test.mush
+    │   ├── run_test.mush
+    │   ├── stop_test.mush
+    │   └── wait_test.mush
+    ├── src
+    │   ├── execution.c
+    │   ├── jobs.c
+    │   ├── main.c
+    │   ├── mush.lex.c
+    │   ├── mush.tab.c
+    │   ├── program.c
+    │   ├── store.c
+    │   └── syntax.c
+    └── tests
+        └── base_tests.c
+
+ +If you run `make`, the code should compile correctly, resulting in an +executable `bin/mush`. If you run this program, it doesn't do very +much, because there are a number of pieces that you have to fill in. + +## `Mush`: Overview + +The `mush` language is a simple programming language which was roughly +inspired by the classic programming language BASIC. +A `mush` program consists of a set of *statements*, with one statement +per line of program text. +The syntax of statements is given by the following context-free grammar: + +``` +<statement> ::= list + | delete <lineno> , <lineno> + | run + | cont + | <lineno> stop + | <optional_lineno> set <name> = <expr> + | <optional_lineno> unset <name> + | <optional_lineno> goto <lineno> + | <optional_lineno> if <expr> goto <lineno> + | <optional_lineno> source <filename> + | <optional_lineno> <pipeline> + | <optional_lineno> <pipeline> & + | <optional_lineno> wait <expr> + | <optional_lineno> poll <expr> + | <optional_lineno> cancel <expr> + | <optional_lineno> pause +``` + +Some kinds of statements have required *line numbers*, other kinds of +statements have no line numbers, and for some statements the line numbers +are optional. In general, when the `mush` interpreter reads a statement +without a line number, it is executed immediately, whereas when it reads +a statement with a line number it is not immediately executed, but instead +is saved in the *program store*. +The program store maintains a set of statements, each of which has a +line number. In addition, the program store maintains a *program counter*, +which keeps track of the next statement to be executed when `mush` is +in "run mode". + +The `list`, `delete`, `run`, and `cont` statements have no line numbers, +and so can only be executed immediately. The `list` statement causes +`mush` to list the contents of the program store. The `delete` statement +deletes statements from the program store whose line numbers lie within +a specified range. The `run` statement causes `mush` to reset the program +counter to the lowest-numbered statement in the program store and to begin +running automatically. The `cont` statement causes `mush` to continue +automatic execution that has been stopped by the execution of a `stop` statement.
+Since a `stop` statement has a required line number, such a statement +can never be executed immediately, but rather only from the program store +during automatic execution. + +The remaining statements have optional line numbers, and so can be executed +either immediately or from the program store. +The `set` statement is used to set the value of a variable to be the result +of evaluating an expression. +The `unset` statement is used to un-set the value of a variable, leaving it +with no value. +The `goto` statement resets the program counter so that the next statement to +be executed is the one with the specified line number. +The `if` statement causes control to be transferred conditionally to the +statement with the specified line number, if the specified expression evaluates +to a non-zero number. +The `source` statement causes `mush` to interpret the statements in the specified +file before continuing on with the current program. + +A statement can also consist of a *pipeline*, to be executed either in the +"foreground" or in the "background". A pipeline consists of a sequence of +*commands*, separated by vertical bars (`|`), with possible +*input redirection*, specified using `<` followed by a filename, +*output redirection*, specified using `>` followed by a filename, +or *output capturing*, specified using `>@`. +A pipeline is executed by `mush` in much the same fashion as it would be +executed by a shell such as `bash`: a group of processes is created to run +the commands concurrently, with the output of each command in the pipeline +redirected to become the input of the next command in the pipeline. +If input redirection is specified, then the first command in the pipeline +has its input redirected from the specified file. +If output redirection is specified, then the last command in the pipeline +has its output redirected to the specified file. 
+If output capturing is specified, then the output of the last command in the +pipeline is read by the `mush` interpreter itself, which makes it available +as the value of a variable that can be referenced by the execution of +subsequent statements in the program. + +Each command in a pipeline consists of a nonempty sequence of *args*, +where the first arg in the command specifies the name of a program to be run +and the remaining args are supplied to the program as part of its argument +vector. In `mush`, an arg takes the form of an *atomic expression*, +which can be either a *string variable*, a *numeric variable*, +a *string literal*, or an arbitrary expression enclosed in parentheses. + +The syntax of pipelines, commands, and args is given by the following grammar: + +``` + ::= + | < + | > + | >@ + + ::= + | | + + ::= + + ::= + | + + ::= + + ::= + | + | + | ( ) +``` + +`Mush` supports *expressions* built up from *string variables*, +*numeric variables*, and *literal strings*, using various unary +and binary operators, as given by the following grammar: + +``` + ::= + | == + | < + | > + | <= + | >= + | && + | || + | ! + | + + | - + | * + | / + | % +``` + +A *string variable* consists of a `$` symbol followed by a *name*, +which is a sequence of alphanumeric characters and underscores, +beginning with an alphabetic character or an underscore. +A *numeric variable* is similar, except it uses a `#` symbol in place +of the `$`. +A *literal string* is either a *number*, which consists of digits, +a *word*, which consists of non-whitespace characters which do not otherwise +have some special meaning to `mush`, or a *quoted string*, which is enclosed +in double quotes and which may contain special characters. +A *filename* that appears in the input or output redirection part of a +pipeline is permitted to be either a word or a quoted string. +This allows simple filenames without special characters to be specified +without quotes. 
Filenames that contain special characters (including `/`) +must be specified as quoted strings. + +Here is a simple example of a `mush` program: + +``` +10 echo "Let's start!" +20 set x = 0 +30 date >@ +40 set d = $OUTPUT +50 echo The date and time is: $d +60 sleep 1 +70 set x = #x + 1 +80 if #x <= 10 goto 30 +90 stop +``` + +The remaining types of statements that `mush` understands have to do with +the manipulation of concurrently executing *jobs*. +Each time `mush` executes a pipeline statement, a new job is created. +`Mush` keeps track of the existing jobs in a *jobs table*. +Each job in the jobs table has an associated *job ID*, which is a nonnegative +integer that uniquely identifies the job. +After starting a job, `mush` sets the value of the `JOB` variable to be +the job ID of the job that was started. +For a foreground job, `mush` waits for the job to complete and then sets the +value of the `STATUS` variable to be the exit status of the job. +`Mush` then *expunges* the job from the jobs table. +For a background job, `mush` does not wait for the job to complete, but instead +continues execution. At a later time, a `wait` statement can be executed +in order to wait for the background job to complete, to collect its +exit status, and to expunge the job. Alternatively, a `poll` statement can +be executed to check whether the job has terminated without waiting if it +has not. If the job has terminated, then the exit status is collected and +the job is expunged with a `poll` statement, similarly to a `wait` statement. +Execution of a `cancel` statement makes an attempt to cancel a specified +background job. A `SIGKILL` signal is sent to the process group to which the +processes in the jobs belong. If the processes have not already terminated, +then they will terminate upon receiving the `SIGKILL` signal. +A `wait` statement may be used to wait for this termination to occur and +to expunge the canceled job from the jobs table. 
+Note that the `wait`, `poll`, and `cancel` statements all permit the use of an +arbitrary expression to specify the job ID. + +The final kind of statement that `mush` supports is the `pause` statement. +This statement causes execution to be suspended pending the receipt of a signal +that might indicate a change in the status of jobs in the jobs table. +When such a signal is received, execution continues. +This way, `mush` can wait for a change in job status without consuming an +excessive amount of CPU time. + +### Demonstration version + +To help you understand how `mush` is intended to behave, I have provided a +demonstration version as a binary with the assignment basecode. +This can be found as the executable `demo/mush`. +This demonstration version is intended as an aid to understanding only; +it should not be regarded as a specification of what you are to do. +It is likely that the demonstration version has some bugs or that its +behavior does not conform in some respects to what is stated here and in +the specifications in the basecode. + +## Tasks to be Completed + +Included in the basecode for this assignment is an implementation of a +parser for `mush` statements and the basic control structure of the +`mush` interpreter. A number of modules have been left for you to +implement. These are: + + * A *program store* module, which is used to hold a `mush` program + and manage the program counter. + + * A *data store* module, which is used to keep track of the current values + of the variables used in a `mush` program. + + * A *jobs* module, which keeps track of the currently executing jobs using + a jobs table, and implements job manipulation functions used to execute + and wait for pipelines, collect exit status, perform input and output + redirection, and implement the output capture feature of `mush`. 
+ +### The Program Store Module + +Specifications and stubs for the functions that make up the program store module +of `mush` are given in the source file `src/program.c`. +Implementation of these functions from the specifications should be relatively +straightforward, so I will not spend additional space on them here. +The choice of data structure used to represent the program store has been left +to you. +Pay close attention to what the specifications say about who has the responsibility +for freeing the memory associated with statements in the store. +A correct implementation should not leak memory associated with program statements, +and of course it should not suffer from double free bugs and the like. + +### The Data Store Module + +Specifications and stubs for the functions that make up the data store module +of `mush` are given in the source file `src/store.c`. +Once again, I expect that implementation of these functions should be relatively +straightforward. As for the program store, the choice of data structure used +to implement the data store is for you to make and you should pay attention to +what the specifications say about who is responsible for freeing memory. + +### The Jobs Module + +Specifications and stubs for the functions that make up the jobs module +of `mush` are given in the source file `src/jobs.c`. +It is this module that is likely to be unfamiliar and to present some challenges +to you, so I am providing some additional guidance here. + + * You will need to implement some form of "jobs table" in this module, + to keep track of the jobs that have been created but not yet expunged. + The data structure you use is up to you. If you find it convenient, + you may assume that at most `JOBS_MAX` jobs can exist at one time, + where `JOBS_MAX` is a C preprocessor symbol defined in `mush.h`. + Write your code so that it does not depend on a particular value for + `JOBS_MAX`; do not hard-code the value into your implementation. 
+ + * Your jobs module will need to make use of handlers for two types of signals. + The first is the `SIGCHLD` signal used to obtain notifications when a child + process terminates. This has been discussed in class and can also be found + in the textbook. + The second type of signal you will need to handle is the `SIGIO` signal used + to obtain notifications when a file descriptor is ready for reading. + This will be important to enable your program to capture output from + concurrently executing background jobs without the need to commit to waiting + for data from any one of them at any particular time. This is discussed + further below. + + * For correct operation, your implementation will likely have to make use of + the `sigprocmask()` function to mask signals during times when a signal handler + should be prevented from running. You will likely also need to use the + `sigsuspend()` function under certain circumstances to await the arrival of a + signal. + + * When executing a pipeline consisting of N commands, a total of N+1 processes + should be used. One of these processes, which we will call the pipeline + *leader*, should be the direct child of the main `mush` process. + The remaining `N` processes will be children of the leader process, and will + each execute one of the commands in the pipeline. + The leader process should set itself into a new process group using its own + process ID as the process group ID, and its `N` child processes should belong + to this process group. This is so that job cancellation can be performed by + sending just one `SIGKILL`, directed at the process group for the job. + The leader process should wait for and reap its `N` children before terminating. + The main `mush` process should use its `SIGCHLD` handler to receive notifications + about the termination of pipeline leader processes and to collect their + exit status. 
+ + * Besides the `fork()` system call used to create the processes, the creation of the pipeline + will involve the use of the `open()`, `pipe()`, and `dup2()` system calls to set up the pipes + and redirections, and the `execvp()` system call must be used to execute the individual + commands. + + > **Important:** You **must** create the processes in a pipeline using calls to + > `fork()` and `execvp()`. You **must not** use the `system()` function, nor use any + > form of shell in order to create the pipeline, as the purpose of the assignment is + > to give you experience with using the system calls involved in doing this. + + * Once having set up the pipeline, the pipeline leader will use `wait()` or `waitpid()` + to await the completion of the processes in the pipeline. + The leader process should wait for all of its children to terminate before + terminating itself. The leader should return the exit status of the process + running the last command in the pipeline as its own exit status, if that + process terminated normally. If the last process terminated with a signal, + then the leader should terminate via SIGABRT. + + * The `pipe()` and `dup2()` system calls should be used to perform the input + and output redirection associated with a pipeline, as discussed in class and + in the textbook. Files used for input and output redirection should be opened + using the `open()` system call. For correct operation of a pipeline, care + should be taken while setting up the pipeline that each process makes sure to + `close()` pipe file descriptors that it does not use. + + * The capturing of output from a pipeline by the main `mush` process is to be + accomplished as follows. Before forking the pipeline leader, a pipe should + be created to provide a way to redirect output from the last process in the + pipeline back to the main `mush` process. The redirection will be accomplished + using `dup2()` as usual. 
The main `mush` process will need to save the file + descriptor for the read side of the pipe in the jobs table along with other + state information from that job. Output from the pipeline will be collected + by the main `mush` process by reading from the read side of the pipe and + saving what is read in memory. Automatic dynamic allocation of however much + memory is required to hold the output can be accomplished by using the + `open_memstream()` function to obtain a `FILE` object to which the data can + be written. + + The main technical issue involved in output capturing is how to arrange for + the main `mush` process to collect the output produced from multiple + concurrently executing pipelines, without having to block waiting for any one + of them to produce output at any given time. This can be done using so-called + *asynchronous I/O*. When the main `mush` process creates the pipe from which + it will read the captured data, it should perform the following system calls + (`readfd` is the file descriptor for the read side of the pipe): + + ``` + fcntl(readfd, F_SETFL, O_NONBLOCK); + fcntl(readfd, F_SETFL, O_NONBLOCK | O_ASYNC); + fcntl(readfd, F_SETOWN, getpid()); + ``` + + The first of these calls enables *non-blocking I/O* on the file descriptor. + This means that an attempt to `read()` the file descriptor when no data is + available will not cause the main `mush` process to block (*i.e.* wait for + data to arrive); rather the `read()` will return immediately with an error + and `errno` set to `EWOULDBLOCK`. + The second call sets *asynchronous mode* on the file descriptor. + (`O_NONBLOCK` is repeated in this call because `F_SETFL` replaces the entire + set of file status flags rather than adding to it.) + When this is set, the operating system kernel will send a `SIGIO` signal + whenever there has been a change in status of the file descriptor; for example, + whenever data becomes available for reading. + The third call is necessary to set the "ownership" of the file descriptor + to the main `mush` process, so that the kernel knows to which process + the `SIGIO` signals should be directed.
+ + Once you have done this, then the main `mush` process can use a handler for + `SIGIO` signals to become notified when there is output that needs to be + captured. It can then poll each of the file descriptors from which output + is supposed to be captured, using `read()` to read input from each of them + and save it in memory, until `EWOULDBLOCK` indicates that there is no more data + currently available. This way, it can collect the captured output in a timely + fashion without getting "stuck" waiting for output that might take an + indefinite amount of time to arrive. + + For more information, you will have to look at the man pages for the various + system calls involved, including `pipe()`, `dup2()`, `fcntl()`, `open()`, `read()`, + `signal()` (or `sigaction()`), `sigprocmask()`, and `sigsuspend()`. + +## Using `gdb` to Debug Multi-process Programs + +Although it gets harder to debug using `gdb` once multiple processes are involved, +there is some support for it. The `gdb` command `set follow-fork-mode parent` +causes `gdb` to follow the parent process after a `fork()` (this is the default). +Similarly, the command `set follow-fork-mode child` causes `gdb` to follow the child +process instead. + +## Provided Components + +### The `mush.h` Header File + +The `mush.h` header file that we have provided gives function prototypes for +the functions that you are to implement, and contains a few other related +definitions. The actual specifications for the functions will be found +as comments attached to stubs for these functions in the various C source files. + + > :scream: **Do not make any changes to `mush.h`. It will be replaced + > during grading, and if you change it, you will get a zero!** + +### The `syntax.h` Header File + +The `syntax.h` header file that we have provided defines the data structures +used to represent parsed `mush` statements.
Mostly, you don't have to know +much about the details of these data structures, except, for example, +that you will need to be able to extract some information from them, +such as the pipeline from a foreground or background pipeline statement. +To avoid memory leaks, you will need to use the various `free_xxx()` +functions provided to free syntactic objects when they are no longer being used. +You will also need to use the function provided to make a copy of a pipeline +object in a certain situation -- see the specification for `jobs_run()` for +more information. + + > :scream: **Do not make any changes to `syntax.h`. It will be replaced + > during grading, and if you change it, you will get a zero!** + +### The `syntax.c` Source File + +The `syntax.c` source file that we have provided contains the implementations +of the various functions for which prototypes are given in `syntax.h`. + + > :scream: **Do not make any changes to `syntax.c`. It will be replaced + > during grading, and if you change it, you will get a zero!** + +### The `mush.lex.c`, `mush.tab.c`, and `mush.tab.h` Files + +The basecode provides a parser for the `mush` language. This parser is +implemented using the GNU `bison` parser generator and the GNU `flex` +lexical analyzer generator. The `mush.lex.c`, `mush.tab.c`, and `mush.tab.h` +files are auto-generated files produced by the `bison` and `flex` programs. + + > :scream: **None of these files should be changed or edited. + > Do *not* do the sloppy things that lots of people seem to do, + > namely, editing these files, reformatting them or otherwise mutating them, + > and then committing the changed results to `git`. You will regret it + > if you do this, and you have been duly warned!** + +### The `demo/mush` Executable + +The file `demo/mush` is an executable program that behaves more or less like +how your program should behave when it is finished.
+ + > :scream: The behavior of the demo program should be regarded as an example + > implementation only, not a specification. If there should be any discrepancy + > between the behavior of the demo program and what it says either in this document + > or in the specifications in the header files, the latter should be regarded + > as authoritative. + +### The `rsrc` Directory + +The `rsrc` directory contains some sample `mush` scripts which I used while +writing the demo version. They were mostly designed very quickly to exercise +the basic features of `mush`, to verify that they worked to a first cut. +One way to run them is to type *e.g.* `source rsrc/xxx_test.mush` to the +`mush` prompt, to get it to read and execute the test. +If you have run one test and you want to run another, you should use the +`delete` command to clear any statements from the program store that might +have been left by the first test, otherwise they might interfere with the +new test. + +### The `tests` Directory + +The `tests` directory contains just one file, `base_tests.c`, which contains one +Criterion test that isn't very interesting. This file is basically just a +placeholder where you can put tests you might think of yourself. + +## Hand-in instructions +As usual, make sure your homework compiles before submitting. +Test it carefully to be sure that it doesn't crash or exhibit "flaky" behavior +due to race conditions. +Use `valgrind` to check for memory errors and leaks. +Besides `--leak-check=full`, also use the option `--track-fds=yes` +to check whether your program is leaking file descriptors because +they haven't been properly closed. +You might also want to look into the `valgrind` `--trace-children` and related +options. + +Submit your work using `git submit` as usual. +This homework's tag is: `hw4`.
diff --git a/reference_doc/CSE320_ReferenceDoc.md b/reference_doc/CSE320_ReferenceDoc.md new file mode 100644 index 0000000..e3aace2 --- /dev/null +++ b/reference_doc/CSE320_ReferenceDoc.md @@ -0,0 +1,1280 @@ +# CSE 320 Reference + +**NOTE: This document has traditionally been provided (in PDF form) at the beginning +of the course; however, it was written in the ancient past and the source was no longer +available. This version (in Markdown) has been reverse-engineered from the PDF source, +so that it can be updated in the future. The reverse engineering turned up some errors +in the original document, and it likely introduced new errors. But now the errors can +be corrected if somebody reports them :smiley:.** + +## Using the Terminal + +Great resources for understanding and working with the command line: + +[http://www.ibm.com/developerworks/library/l-lpic1-103-1/](http://www.ibm.com/developerworks/library/l-lpic1-103-1/) + +[https://learnpythonthehardway.org/book/appendixa.html](https://learnpythonthehardway.org/book/appendixa.html) + +## GCC + +```c +#include <stdio.h> +#include <stdlib.h> + +int main(int argc, char* argv[]) { + printf("Hello World!\n"); + return EXIT_SUCCESS; +} +``` + +### Lines 1 and 2 + +Lines 1 and 2 are the C **preprocessor** statements which include +**function prototypes** for some of the functions in the **C standard library** +(aka libc). For now you can just vaguely relate these to the `import` +statements you might find at the top of a Java file. + +```java +import java.util.Scanner; +``` + +The C preprocessor is a very powerful tool and you will learn about it +in future assignments. For now, just accept this basic explanation of +what these two lines do. The `#include` directive takes the contents of +the `.h` file and copies it into the `.c` file before the C compiler +actually translates the C code. + +> :nerd: Files that end in .h are called header files.
They typically + contain preprocessor macros, function prototypes, **struct information**, + and **typedefs**. + +### Line 4 + +Line 4 is how you describe the `main()` function of a C program. In C, +if you are creating an executable program it must have one and ONLY one +main function. It should also be as isolated as possible: if you can +(and for this class you always should), put `main()` in its own `.c` +file. Any main function you write in this course MUST return an integer +value (in older textbooks/documentation they might return `void`; watch +out). + +This is sort of similar to the `main()` declaration in Java. In Java, +arrays, since they are objects, have various different attributes (*e.g.* +length). C is not an object oriented language and hence arrays contain +no such information (arrays in C are very similar to arrays in +MIPS). To remedy this issue two arguments are passed: `argc`, +which contains how many elements are in the array and `argv`, which is an +array of strings which contains each of the arguments passed on the +command line. Even if no arguments are passed by the user, `argv` will +contain at least one argument which is the name of the binary being +executed. + +> :nerd: If you look through other C programs, you might see that + there are quite a few different ways to declare `main`. In this course + you may declare `main` just as it is in the `helloworld` example unless + specified otherwise in the homework assignment. + +> :scream: It is crucial that there exists exactly one `main()` function + in your whole program. C is not like Java, where you can have a + different main in every file and then choose which main you want to + run. If you have more than one main when you try to compile it will + give you an error. For example, assume you had two files `main1.c` and + `main2.c` and you tried to compile them both into one program + (a reasonable thing to do).
If both `main1.c` and `main2.c` have a main + function defined in them, when you try to compile it you get the + following linker error: + + ``` + /tmp/cc8eYGEA.o: In function ‘main’: + main2.c:(.text+0x0): multiple definition of ‘main’ + /tmp/ccaaqneq.o:main1.c:(.text+0x0): first defined here + collect2: error: ld returned 1 exit status + ``` + + This error means that the main function is defined twice within your + program. This concept extends to all functions. Two functions *CAN NOT* + have the same name under normal conditions. In addition, function + overloading is not allowed in C. Example: Assume you had the file + func.c with the following function declarations. + + ```c + void func(int a); + void func(int a, int b); + ``` + + This will result in the following error: + + ``` + func.c:5:6:error: conflicting types for ‘func’ + void func(int a, int b) { + ^ + func.c:1:6: note: previous definition of ‘func’ was here + void func(int a) { + ``` + +### Line 5 + +Line 5 is how this program is printing out its values to standard +output (stdout). The printf function can be compared to the +System.out.printf() function in Java. This function accepts a char* +argument known as the format string (assume for now char* is equivalent +to the Java String type). This will work fine for when you know ahead +of time what you want to print, but what if you want to print a +variable? + +If you assume C is like Java, you may try to concatenate strings in +the following form: + +```c +int i = 5; +printf("The value of i is " + i + "\n"); +``` + +If you try to compile this code, GCC may give you some of the +following cryptic error messages: + +``` +error: invalid operands to binary + (have ‘char *’ and ‘char *’) +``` + +or + +``` +warning: format not a string literal and no format arguments [-Wformat-security] +``` + +Unfortunately, C does not have string concatenation via the + +operator.
However, the `printf()` function also takes a variable number +of arguments after the format string. In order to print a variable you +have to specify one of many available **conversion specifiers** +(a % sign followed by one or more characters). Below is an example of how to +print an integer in C. + +```c +int i = 5; +printf("The value of i is %d\n", i); +``` + +> :nerd: You can view a list of all printf formats here. Alternatively + you can use the command `man 3 printf` in your terminal to view the + documentation for printf as well. This is an example of a man + page (manual page). Man pages are how most of the library functions in + C are documented. You are highly encouraged to utilize them as they are + extremely useful and highly beneficial. Man pages are also available + online. + +The printf function always prints to the filestream known as `stdout` +(standard output). There are three **standard streams** that are usually +available to each program, namely: `stdin` (standard input), `stdout`, and +`stderr` (standard error). Prior to `*nix`, computer programs needed to +specify and be connected to a particular I/O device such as magnetic +tapes. This made portability nearly impossible. Later in the course we +will delve deeper into “files” and how they represent abstract devices +in Unix-like operating systems. For now understand that they work +much like your typical .txt file. They can be written to and read from. + +### Line 6 + +Line 6 is the end of the main function. The value returned in main is +the value that represents the return code of the program. In `*nix` when +a program exits successfully, the value returned is usually zero. When +it has some sort of an error, the value is usually a non-zero +number. Since these values are defined by programmers and they may +be different depending on the system you are using, it is usually best +to use the constants `EXIT_SUCCESS` and `EXIT_FAILURE` which are defined in +`stdlib.h` for simple cases as they will represent the respective exit +codes for each system.
+ +> The term `*nix` is used for describing operating systems that are + derived from the *Unix* operating system (ex. BSD, Solaris) or clones of + it (ex. Linux). + +## Compiling C Code + +Begin compiling the following program: + +```c +#include +#include + +int main(int argc, char* argv[]) { + printf("Hello World!\n"); + return EXIT_SUCCESS; +} +``` + +Navigate on the command line to where the `.c` file is located. If the +file was called `helloworld.c`, type the following command to compile the +program. + +``` +$ gcc helloworld.c +``` + +> The `$` is the commandline prompt. **Your prompt may differ**. + +If no messages print, that means there were no errors and the +executable was produced. To double check that your program produced a +binary you can type the `ls` command to list all items in the directory. + +``` +$ ls +a.out helloworld.c +$ +``` + +The file **`a.out`** is your executable program. To run this program, +put a `./` in front of the binary name. + +``` +$ ./a.out +Hello World! +$ +``` + +> The `./` has a special meaning. The `.` translates to the path of the +current directory. So if your file was in the cse320 directory on the +user’s desktop then when you type `./a.out` this would really +translate to the path `/home/user/Desktop/cse320/a.out`. + +## Compilation Flags + +Modify the `helloworld` program to sum up the values from 0 to 5. + +```c +#include +#include + +int main(int argc, char *argv[]) { + int i, sum; + for(i = 0; i < 6; i++) { + sum += i; + } + printf("The sum of all integers from 0-5 is: %d\n", sum); + return EXIT_SUCCESS; +} +``` + +Compile and run this program. + +``` +$ gcc helloworld2.c +$ ./a.out + +The sum of all integers from 0-5 is: 15 +$ +``` + +This program compiled with no errors and even produced the correct +result. However, there is a subtle but hazardous bug in this code. The +developers of the **gcc C compiler** have built in some functionalities +(enabled by flags) to help programmers find them. 
+ +Add the flags `-Wall` and `-Werror` to the `gcc` command when compiling. Like so: + +``` +$ gcc -Wall -Werror helloworld2.c +helloworld2.c:7:3: error: variable 'sum' is uninitialized when used here + [-Werror,-Wuninitialized] + sum += i; + ^~~ +helloworld2.c:5:12: note: initialize the variable 'sum' to silence this warning + int i, sum; + ^ + = 0 +1 error generated. +$ +``` + +> Depending on your compiler (gcc, clang, etc.) the above error and + message may differ. Recent versions of gcc only produce an error when + optimization (`-O1`, `-O2`, or `-O3`) is enabled. + +> The flag `-Wall` enables warnings for all constructions that some users + consider questionable, and that are easy to avoid (or modify to prevent + the warning), even in conjunction with macros. + +> The flag `-Werror` converts all warnings to errors. Source code +> which triggers warnings will be rejected. + +This error means that the variable `sum` was used without being +initialized. Why does this matter? The C language does not actually +specify how the compiler should treat uninitialized +variables. Implementations of the C compiler may zero them out for you, +but really there is no specification of how this situation should be +handled. This can lead to undefined behavior and cause the program to +work one way on one system and differently on other systems. To fix this +error, simply initialize the variable sum to the value desired (0). + +```c +#include +#include + +int main(int argc, char *argv[]) { + int i, sum = 0; + for(i = 0; i < 6; i++) { + sum += i; + } + printf("The sum of all integers from 0-5 is: %d\n", sum); + return EXIT_SUCCESS; +} +``` + +Compile the program again and you should no longer see any errors. + +``` +$ gcc -Wall -Werror helloworld2.c +$ ./a.out +The sum of all integers from 0-5 is: 15 +$ +``` + +> :scream: In this class, you *MUST ALWAYS* compile your assignments +> with the flags `-Wall -Werror`.
This will help you locate mistakes in +> your program and the grader will compile your assignment with these +> flags as well. Consider this your warning: `-Wall -Werror` are +> necessary. Do not progress through your assignment without using +> these flags and attempt to fix the errors they highlight last minute. + +## GNU Make and Makefiles + +As you program more in C, you will continue to add more flags and more +files to your programs. To type these commands over and over again will +eventually become an error-laden chore. Also as you add more files, if +you rebuild every file every time, even if it didn’t change, it will +take a long time to compile your program. To help alleviate this issue +build tools were created. One such tool is GNU Make (you will be +required to use Make in this class). Make itself has lots of options +and features that can be configured. While mastering Make is not +required for this class, you will probably want to learn how to make +simple changes to what we supply. + +Refer +[here](http://www.cs.colby.edu/maxwell/courses/tutorials/maketutor/) +for a great Makefile tutorial and information resource. **You will +always be provided with a working makefile; this is provided for +extended learning.** + +[http://www.cs.colby.edu/maxwell/courses/tutorials/maketutor/](http://www.cs.colby.edu/maxwell/courses/tutorials/maketutor/) + +## Header Files + +There are some coding practices that you should become familiar with +in C from the beginning. The C compiler reads through your code once +and only once. This means all functions and variables you use must be +declared in advance of their usage or the compiler will not know how to +compile them and will exit with errors. This is why we have header files: we +declare all of our function prototypes in a `.h` file and +`#include` it in our `.c` file. This is so we can write the body of our +functions in any order and call them in any order we please.
+ +A header file is just a file which ends in the `.h` extension. Typically +you declare **function prototypes**, define `struct` and `union` types, +`#include` other header files, `#define` constants and macros, and +`typedef`. Some header files also expose global variables, but this is +strongly discouraged as it can cause compilation errors. + +When you declare function prototypes in a `.h` file, you can then define +the body of the function inside of any `.c` file. Though typically, if +the header file was called `example.h`, we would define the functions in +`example.c`. If we were producing a massive library like +[stdlibc](https://en.wikipedia.org/wiki/C_standard_library), we +may instead declare all the function prototypes in a single header file +but put each function definition in its own file. It’s all +a preference, but these are two common practices. You should never be +defining function bodies in the header though; this will just cause you +issues later. + +There are two ways to specify where the include directive looks for +header files. If you use `<>`, when the preprocessor encounters the +include statement it will look for the file in a predefined location +on your system (usually `/usr/include`). If you use `""`, the preprocessor +will look in the current directory of the file being +processed. Typically system and library headers are included using `<>`, +and custom headers that you have made for your program are included +using `""`.
+ +### Header file example + +```c +#include +#include +#include + +#define TRUE 1 +#define FALSE 0 + +struct student { + char *first_name; + char *last_name; + int age; + float gpa; +}; + +int foo(int a, int b); +void bar(void); +``` + + +```c +#include"example.h" + +int main(int argc, char *argv[]){ + bar(); + return EXIT_SUCCESS; +} + +void bar(void){ + printf("foo: %d", foo(2, 3)); +} + +int foo(int a, int b) { + return a * b; +} +``` + +### Header Guard a.k.a Include Guard + +While using header files solves one issue, they create issues of their +own. What if multiple files include the same header file? What if +header file A includes header file B, and header file B includes +header file A? If we keep including the same header file multiple +times, this will make our source files larger than needed and slow +down the compilation process. It may also cause errors if there are +variables declared in the code. If two files keep including each other +how does the compiler know when to stop? To prevent such errors one +must utilize **header guards**. The header guard is used to prevent double +and cyclic inclusion of a header file. + +### Header Guard example + +In grandparent.h: + +```c +struct foo { + int member; +}; +``` + +In parent.h: + +```c +#include "grandparent.h" +``` + +In child.h: + +```c +#include "grandparent.h" +#include "parent.h" +``` + +The preprocessor will paste a literal copy of the +`foo` definition into the source twice and this will create a compiler error since +`struct foo` is then defined more than once. The fix: + +In grandparent.h: + +```c +#ifndef GRANDFATHER_H +#define GRANDFATHER_H +struct foo { + int member; +}; +#endif +``` + +In parent.h: + +```c +#include "grandparent.h" +``` + +In child.h: + +```c +#include "grandparent.h" +#include "parent.h" +``` + +`#ifndef`, `#define`, and `#endif` are preprocessor directives that +prevent the double inclusion.
This is because when the `parent.h` file +includes `grandparent.h` for the second time the `#ifndef` test evaluates to +false, so the second definition of `foo` is never included. +Read [here](https://en.wikipedia.org/wiki/Include_guard#Double_inclusion) +for more information. + +> You should always use header files and guards in your + assignments. Newer compilers now support what is known as `#pragma once`. + This directive performs the same operation as the header guard, + but it may not be a cross platform solution when considering + older machines. + +### Directory Structure + +To help with a clear and consistent structure to your programs, you +can use the following directory structure. This is a common directory +structure for projects in C. + +``` +. +├── Makefile +├── include +│   ├── debug.h +│   └── func.h +└── src + ├── main.c + └── func.c +``` + +> :scream: You will be **REQUIRED** to follow this structure for **ALL** the homework + assignments for this class. Failure to do so will result in a ZERO. + +## Datatype Sizes + +Depending on the system and the underlying architecture, which can +have different word sizes etc., datatypes can have various different +sizes. In a language like Java, many of these issues are hidden from +the programmer. The JVM creates another layer of abstraction which can +allow the programmer to believe all datatypes are of the same size no +matter the underlying architecture. C, on the other hand, does not +have this luxury. The programmer has to consider everything about the +system being worked on. To make programs cross platform, code and +logic need to be tested, comparing results and output, and altered +accordingly. + +C lacks the ability to add new datatypes to its +specification. Instead, it works with models known as LP64, +ILP64, LLP64, ILP32, and LP32. The `I` stands for `INT`, the `L` stands for +`LONG` and the `P` stands for `POINTER`. The number after the letters +describes the maximum bit size of the data types.
+ +The typical sizes of these models are described below in the following +table (in bits): + +| Model | short | int | long | long long | pointer | +|-------|-------|-----|------|-----------|---------| +| LP32 | 16 | 16 | 32 | 64 | 32 | +| ILP32 | 16 | 32 | 32 | 64 | 32 | +| LLP64 | 16 | 32 | 32 | 64 | 64 | +| LP64 | 16 | 32 | 64 | 64 | 64 | +| ILP64 | 16 | 64 | 64 | 64 | 64 | + +Notice that the size of an integer on one machine could be different +from that on another machine depending on which model the machine +runs. To prove this to yourself, use the special operator in the C +language known as `sizeof`. The operator `sizeof` will tell you the size of +a specific datatype in bytes. As an exercise, you should create the +following program and run it in your development environment and on +a system with a different underlying architecture (such as 'Sparky') +and compare the results. + +```c +#include +#include + +int main(int argc, char *argv[]) { + /* Basic data types */ + printf("=== Basic Data Types ===\n"); + printf("short: %lu bytes\n", sizeof(short)); + printf("int: %lu bytes\n", sizeof(int)); + printf("long: %lu bytes\n", sizeof(long)); + printf("long long: %lu bytes\n", sizeof(long long)); + printf("char: %lu byte(s)\n", sizeof(char)); + printf("double: %lu bytes\n", sizeof(double)); + /* Pointers */ printf("=== Pointers ===\n"); + printf("char*: %lu bytes\n", sizeof(char*)); + printf("int*: %lu bytes\n", sizeof(int*)); + printf("long*: %lu bytes\n", sizeof(long*)); + printf("void*: %lu bytes\n", sizeof(void*)); + printf("double*: %lu bytes\n", sizeof(double*)); + /* Special value - This may have undefined results... why? */ + printf("=== Special Data Types ===\n"); + printf("void: %lu byte(s)\n", sizeof(void)); + return EXIT_SUCCESS; +} +``` + +To further illustrate why this is a problem, consider the following program.
+ +```c +#include +#include + +int main(int argc, char *argv[]) { + // 0x200000000 -> 8589934592 in decimal + long value = strtol("200000000", NULL, 16); + printf("value: %ld\n", value); + return EXIT_SUCCESS; +} +``` + +In libc, there exists a header `stdint.h` which defines fixed-width types +(such as `int32_t` and `int64_t`) that are guaranteed to have the same size +no matter what system you are on. + +## Endianness + +When dealing with multi-byte values and different architectures, the +**endianness** of each architecture should also be taken into +account. There are many ways to detect what endianness your machine +is, for example: + +```c +#include +#include + +int main(int argc, char *argv[]) { + unsigned int i = 1; + char *c = (char*)&i; // Convert the LSB into a character + if(*c) { + printf("little endian\n"); + } else { + printf("big endian\n"); + } + return EXIT_SUCCESS; +} +``` + +Can you think of why this works? Could you explain it if asked on an exam? + +## Assembly + +During the compilation process, a C program is translated to an +assembly source file. This is important because it is possible that +something which has great performance in one system could have +terrible performance in another with the exact same C implementation. +In this case, the programmer has to inspect the assembly code for +more information. + +Example: + +```c +// asm.c +#include +#include +#include +#include +int main(int argc, char *argv[]) { + char buffer[1024]; + // Get user input + fgets(buffer, 1024, stdin); + int64_t value = strtoll(buffer, NULL, 10); + printf("You entered %" PRId64 "\n", value); + return EXIT_SUCCESS; +} +``` + +Test the program with 32-bit binaries vs 64-bit binaries. To be able +to compile a 32-bit binary on a 64-bit machine, utilize the `-m32` +flag provided by gcc-multilib (installed during HW0).
Here is how to +compile each program respectively: + +``` +$ gcc -Wall -Werror -m32 asm.c -o 32.out +$ gcc -Wall -Werror -m64 asm.c -o 64.out +``` + +Run each program and you should see this output: + +``` +$ ./64.out +75 +You entered 75 +$ ./32.out +75 +You entered 75 +``` + + > 75 is a value that is entered by the user. You can enter any number you choose. + +Notice, even though both programs are compiled for different +architectures, they still produce the same results. These programs are +assembled using different instruction sets though. To see this, compile +the programs with the `-S` flag. This flag will store the intermediate +assembly of the program in a `.s` file. + +For the 64-bit program run: + +``` +$ gcc -Wall -Werror -m64 -S asm.c +``` + +Take a look at `asm.s` which was just generated in the **current working directory**. + +``` +# x86-64 assembly for asm.c + .file "asm.c" + .section .rodata +.LC0: + .string "You entered %ld\n" + .text .globl main + .type main, @function +main: +.LFB2: + .cfi_startproc + pushq %rbp + .cfi_def_cfa_offset 16 + .cfi_offset 6, -16 + movq %rsp, %rbp + .cfi_def_cfa_register 6 + subq $1072, %rsp + movl %edi, -1060(%rbp) + movq %rsi, -1072(%rbp) + movq %fs:40, %rax + movq %rax, -8(%rbp) + xorl %eax, %eax + movq stdin(%rip), %rdx + leaq -1040(%rbp), %rax + movl $1024, %esi + movq %rax, %rdi + call fgets + leaq -1040(%rbp), %rax + movl $10, %edx + movl $0, %esi + movq %rax, %rdi + call strtoll + movq %rax, -1048(%rbp) + movq -1048(%rbp), %rax + movq %rax, %rsi + movl $.LC0, %edi + movl $0, %eax + call printf + movl $0, %eax + movq -8(%rbp), %rcx + xorq %fs:40, %rcx + je .L3 + call __stack_chk_fail +.L3: + leave + .cfi_def_cfa 7, 8 + ret + .cfi_endproc +.LFE2: + .size main, .-main + .ident "GCC: (Ubuntu 5.2.1-22ubuntu2) 5.2.1 20151010" + .section .note.GNU-stack,"",@progbits +``` + +Now compile it for x86 using the following command: + +``` +$ gcc -Wall -Werror -m32 -S asm.c +``` + +Again, take a look at `asm.s` which was just
generated in current working directory. + +``` +# x86 assembly for asm.c + .file "asm.c" + .section .rodata +.LC0: + .string "You entered %lld\n" + .text .globl main + .type main, @function +main:.LFB2: + .cfi_startproc + leal 4(%esp), %ecx + .cfi_def_cfa 1, 0 + andl $-16, %esp + pushl -4(%ecx) + pushl %ebp + .cfi_escape 0x10,0x5,0x2,0x75,0 + movl %esp, %ebp + pushl %ecx + .cfi_escape 0xf,0x3,0x75,0x7c,0x6 + subl $1060, %esp + movl %ecx, %eax + movl 4(%eax), %eax + movl %eax, -1052(%ebp) + movl %gs:20, %eax + movl %eax, -12(%ebp) + xorl %eax, %eax + movl stdin, %eax + subl $4, %esp + pushl %eax + pushl $1024 + leal -1036(%ebp), %eax + pushl %eax + call fgets + addl $16, %esp + subl $4, %esp + pushl $10 + pushl $0 + leal -1036(%ebp), %eax + pushl %eax + call strtoll + addl $16, %esp + movl %eax, -1048(%ebp) + movl %edx, -1044(%ebp) + subl $4, %esp + pushl -1044(%ebp) + pushl -1048(%ebp) + pushl $.LC0 + call printf + addl $16, %esp + movl $0, %eax + movl -12(%ebp), %edx + xorl %gs:20, %edx + je .L3 + call __stack_chk_fail +.L3: + movl -4(%ebp), %ecx + .cfi_def_cfa 1, 0 + leave + .cfi_restore 5 + leal -4(%ecx), %esp + .cfi_def_cfa 4, 4 + ret + .cfi_endproc +.LFE2: + .size main, .-main + .ident "GCC: (Ubuntu 5.2.1-22ubuntu2) 5.2.1 20151010" + .section .note.GNU-stack,"",@progbits +``` + +Additionally you can log into sparky, and use the C compiler on that +machine. It will generate 32-bit SPARC assembly. 
+ +``` +$ gcc -Wall -Werror -S asm.c +``` + +``` +# 32-bit SPARC assembly + .file "asm.c" + .section ".rodata" + .align 8 +.LLC0: + .asciz "You entered %lld\n" + .section ".text" + .align 4 + .global main + .type main, #function + .proc 04 +main: + save %sp, -1128, %sp + st %i0, [%fp+68] + st %i1, [%fp+72] + add %fp, -1032, %g1 + mov %g1, %o0 + mov 1024, %o1 + sethi %hi(__iob), %g1 + or %g1, %lo(__iob), %o2 + call fgets, 0 + nop + add %fp, -1032, %g1 + mov %g1, %o0 + mov 0, %o1 + mov 10, %o2 + call strtoll, 0 + nop + std %o0, [%fp-8] + sethi %hi(.LLC0), %g1 + or %g1, %lo(.LLC0), %o0 + ld [%fp-8], %o1 + ld [%fp-4], %o2 + call printf, 0 + nop + mov 0, %g1 + mov %g1, %i0 + return %i7+8 + nop + .size main, .-main + .ident "GCC: (GNU) 4.9.1" +``` + +## Assembly Analysis + +The assembly generated for a particular architecture varies greatly +even though it all accomplishes the exact same task on each +system. Notice that the SPARC assembly is shorter than the other two +(40 lines for SPARC, 67 lines for x86, and 51 lines for x86-64) and +that the registers used are different in all three examples. + +Take a look at how the format string in the printf call got translated: + +```c +printf("You entered %" PRId64 "\n", value); +``` + +``` +.string "You entered %ld\n" # x86-64; 64-bits +.string "You entered %lld\n" # x86; 32-bits +.asciz "You entered %lld\n" # SPARC; 32-bits +``` + +See that PRId64 got translated to different formats: `%ld` and +`%lld`. This is because `int64_t` is mapped to different underlying types +depending on the platform to guarantee that it is exactly 64 bits +wide. In the SPARC code, notice that there are `nop` instructions after +the calls to `printf`, `strtoll`, `fgets`, and after `return`. This is because of a +technique known as **delayed branching** used in the SPARC architecture. + +In the x86 assembly, notice `subl` and `pushl` instructions which are used +to manipulate the stack before calling functions.
These instructions +are absent from the x86-64 example. This is because x86 architecture +has half the amount of registers as x86-64 architectures so the +convention is to push arguments for a function call to the stack +to compensate for this. At the core, the **Application Binary Interface** +differs between the systems. There are also various other differences +that can’t be seen by looking at the assembly such as variable sized +instruction formats, but, in general, you should just be aware that any +C code gets translated very differently depending on the machine. + +## Preprocessor + +Sometimes the easiest way to see what is happening in your program is +to just use print statements. This is a method that everyone can do +(and we know how to do!). However, we shouldn’t just put `printf` all +over our program. We do not always want to see these print outs (way +too much information for normal operation) and we don’t want to have to +comment/uncomment lines constantly. + +One possible solution to this is passing a command line argument that +turns debugging on and off. This might be an acceptable solution but it +will clutter our code with lots of if statements to check if debugging +is enabled or not, make our binary larger when we don’t want debugging +enabled, etc. Instead we will use some preprocessor tricks to give us +some logging statements when we **compile with** the flag +`-DDEBUG`. When we **compile without** the flag `-DDEBUG`, none of these +debugging statements will be printed. + +We have defined in the given Makefile a `debug` target. This compiles +your program with the `-DDEBUG` flag and `-g`, the latter of which is +necessary for gdb to work. You can simply run: + +``` +$ make clean debug +``` + +as opposed to `make clean all` to set your program up for debugging. + +Create a new header called `debug.h` and we can define each of these +macros in this header and use them in `main()` by adding `#include "debug.h"` +to `main.c`. 
+ +debug.h: + +```c +#ifndef DEBUG_H +#define DEBUG_H +#include +#include + +#define debug(msg) printf("DEBUG: %s", msg) + +#endif +``` + +Then in your program use the debug macro: + +main.c: + +```c +#include "debug.h" + +int main(int argc, char *argv[]) { + debug("Hello, World!\n"); + return EXIT_SUCCESS; +} +``` + +Then compile your program and run it. + +``` +$ make clean all +$ bin/hw1 +DEBUG: Hello, World! +``` + +Great! You just created your first **preprocessor macro**. Unfortunately +this is no better than just adding a print statement. Let's fix that! + +The preprocessor has `#if`, `#elif`, and `#else` **directives** that we can +use to control what gets added during compilation. (There is also `#endif` for +completing an if/else block.) Let's create an *if* directive that will +include a section of code if `DEBUG` is defined within the preprocessor. + +debug.h: + +```c +#ifndef DEBUG_H +#define DEBUG_H +#include +#include + +#define debug(msg) printf("DEBUG: %s", msg) + +#endif +``` +main.c: + +```c +#include "debug.h" + +int main(int argc, char *argv[]) { + #ifdef DEBUG + debug("Debug flag was defined\n"); + #endif + printf("Hello, World!\n"); + return EXIT_SUCCESS; +} +``` + +When we compile this program it will check to see if `DEBUG` was +defined in our program. Let's test this out. + +``` +$ make clean all +$ bin/hw1 +Hello, World! +``` + +Cool, the debug message didn’t print out. Now let's define `DEBUG` during +the compilation process, and run the program again. + +``` +$ make clean debug +$ bin/hw1 +DEBUG: Debug flag was defined +Hello, World! +``` + +Here you can see that debug was defined so that extra code between +`#ifdef DEBUG` and `#endif` was included. This technique will work for +certain situations, but if we have a lot of logging messages in our +program this will quickly clutter our code and make it +unreadable. Fortunately we can do better.
+ +Instead of doing `#ifdef DEBUG` all over our program we can instead do +`#ifdef DEBUG` around our `#define debug` macro. + +debug.h: + +```c +#ifndef DEBUG_H +#define DEBUG_H +#include +#include + +#if DEBUG + #define debug(msg) printf("DEBUG: %s", msg) +#endif + +#endif +``` + +main.c: + +```c +#include"debug.h" + +int main(int argc, char *argv[]) { + debug("Debug flag was defined\n"); + printf("Hello, World!\n"); + return EXIT_SUCCESS; +} +``` + +There is an issue with this, but let's try to compile the program. + +``` +$ make clean debug +$ bin/hw1 +DEBUG: Debug flag was defined +Hello, World! +``` + +Cool, it works. Now let's try to compile it without defining `-DDEBUG`. + +``` +$ make clean all +/tmp/cc6F04VW.o: In function `main': +debug.c:(.text+0x1a): undefined reference to `debug' +collect2: error: ld returned 1 exit status +``` + +Whoops. What happened here? Well, when we used `-DDEBUG` the debug macro +was defined, so it worked as expected. When we don’t compile with +`-DDEBUG` the `debug` macro is never defined in our file so it is +never substituted in our program. Since we used `debug` in the middle of +our code the preprocessor and compiler have no idea what the `debug` symbol +is so it fails. Luckily this is easy to fix. We simply have to add +another case to our preprocessor if/else directive to handle this +case. + +debug.h: + +```c +#ifndef DEBUG_H +#define DEBUG_H +#include +#include + +#if DEBUG + #define debug(msg) printf("DEBUG: %s", msg) +#else + #define debug(msg) +#endif + +#endif +``` + +main.c: + +```c +#include"debug.h" + +int main(int argc, char *argv[]) { + debug("Debug flag was defined\n"); + printf("Hello, World!\n"); + return EXIT_SUCCESS; +} +``` + +Here we tell the preprocessor to replace any occurrences of `debug(msg)` +with nothing, so now when we don’t compile with `-DDEBUG`, the +preprocessor simply replaces `debug("Debug flag was defined\n")` with +an empty space. Let's compile again.
+ +``` +$ make clean all +$ bin/hw1 +Hello, World! +``` + +Cool. Now we can embed debug macros all over our program that look +like normal functions. There are still a few more cool tricks we can do +to make this better. The preprocessor has a few special macros defined +called ``__LINE__``, ``__FILE__``, and ``__FUNCTION__``. These macros will be +replaced by the preprocessor to evaluate to the *line number* where the +macro is called, the *file name* that the macro is called in, and the +*function name* that the macro is called in. Let's play with this a bit. + +debug.h: + +```c +#ifndef DEBUG_H +#define DEBUG_H +#include +#include + +#ifdef DEBUG + #define debug(msg) printf("DEBUG: %s:%s:%d %s", __FILE__, __FUNCTION__, __LINE__,msg) +#else + #define debug(msg) +#endif + +#endif +``` + +main.c: + +```c +#include"debug.h" +int main(int argc, char *argv[]) { + debug("Debug flag was defined\n"); + printf("Hello, World!\n"); + return EXIT_SUCCESS; +} +``` + +Let's compile this program and run. + +``` +$ make clean debug +$ bin/hw1 +DEBUG: debug.c:main:11 Debug flag was defined +Hello, World! +``` + +As you can see all the `__FILE__`, `__FUNCTION__`, and `__LINE__` were +replaced with the corresponding values for when debug was called in the +program. Pretty cool, but we can still do even better! Normally when +we want to print something we use `printf()` and use the format +specifiers and variable arguments to print useful information. With our +current setup though we can’t do that. Fortunately for us the +preprocessor offers up a `__VA_ARGS__` macro which we can use to +accomplish this. + +> I want to point out that the syntax for this gets a bit crazy and hard +to understand (complex preprocessor stuff is a bit of a black +art). I’ll try my best to describe it but you may need to do some more +googling if the below explanation is not sufficient. + +```c +#ifndef DEBUG_H +#define DEBUG_H +#include +#include + +#ifdef DEBUG + #define debug(fmt, ...)
printf("DEBUG: %s:%s:%d " fmt, __FILE__, __FUNCTION__,__LINE__, ##__VA_ARGS__) +#else + #define debug(fmt, ...) +#endif + +#endif + +#include"debug.h" + +int main(int argc, char *argv[]) { + debug("Program has %d args\n", argc); + printf("Hello, World!\n"); + return EXIT_SUCCESS; +} +``` + +First let's compile and run the program and see the results. + +``` +$ make clean debug +$ bin/hw1 +DEBUG: debug.c:main:11 Program has 1 args +Hello, World! +$ make clean all +$ bin/hw1 +Hello, World! +``` + +The macro works as expected, but let's try to explain it a bit. + +First we changed the definition of the macro to be `#define debug(fmt, ...)`. +The first argument `fmt` is the format string that we normally +define for printf and `...` is the way to declare a macro that accepts a +variable number of arguments. + +Next we have `"DEBUG: %s:%s:%d " fmt`. The C compiler can **concatenate +string literals** that are next to each other. So if `fmt` was the string +`"crazy %d concatenation"` then this expression evaluates to +`"DEBUG: %s:%s:%d crazy %d concatenation"`. Then we have our predefined +preprocessor macros that are used for the string `"DEBUG: %s:%s:%d "`, +and then we reach this next confusing piece: +`, ##__VA_ARGS__`. The macro `__VA_ARGS__` will expand into the variable +arguments provided to the debug statement, but then we have this crazy +`, ##`. This is a hack for allowing no arguments to be passed to the +debug macro, e.g. `debug("I have no varargs")`. If we didn’t do this, the +previous debug statement would produce a warning/error during +the compilation process as it would expect a `__VA_ARGS__` value. + +This is one of the many interesting things we can use the C +preprocessor for. Lastly, preprocessor macros are in-text replacements performed +before compilation; this can mean dangerous things when we are +careless about how we use them.
For example, it is customary to never +put a trailing ; at the end of a macro definition since most programmers would put a +semicolon after the macro as they would most statements. Some +programmers like to wrap the code in macros with a `do{ /*some code +here */ } while(0)` loop. They do this because if your macro is made +up of multiple statements, it will force you to add ; to all the +statements in the do while loop. Then you still have to terminate +this macro with a ; when you use it which makes it seem like a normal +function in your C code. + +Our final product will look like this: + +```c +#ifndef DEBUG_H +#define DEBUG_H +#include +#include + +#ifdef DEBUG + #define debug(fmt, ...) do{printf("DEBUG: %s:%s:%d " fmt, __FILE__, __FUNCTION__,__LINE__, ##__VA_ARGS__);}while(0) +#else + #define debug(fmt, ...) +#endif + +#endif +```